; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
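; For reference, the kind of scalar source that leads to these patterns is a
; loop over a structure-of-six-i32 layout. A minimal hypothetical C sketch
; (struct and names are illustrative, not taken from any particular source):
;
;   struct S { int a, b, c, d, e, f; };  /* one stride-6 group of i32 */
;   void split(struct S *in, int *a, int *b, int *c,
;              int *d, int *e, int *f, int n) {
;     for (int i = 0; i < n; i++) {      /* each field becomes one strided.vec */
;       a[i] = in[i].a; b[i] = in[i].b; c[i] = in[i].c;
;       d[i] = in[i].d; e[i] = in[i].e; f[i] = in[i].f;
;     }
;   }
;
; The vectorizer widens such a loop into a single wide <6*VF x i32> load
; followed by six shufflevectors with stride-6 masks, as in the IR bodies below.
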
define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf2:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa 16(%rdi), %xmm0
; SSE-NEXT:    movdqa 32(%rdi), %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE-NEXT:    movq %xmm1, (%rsi)
; SSE-NEXT:    movq %xmm4, (%rdx)
; SSE-NEXT:    movq %xmm5, (%rcx)
; SSE-NEXT:    movq %xmm6, (%r8)
; SSE-NEXT:    movq %xmm0, (%r9)
; SSE-NEXT:    movq %xmm7, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf2:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm3, (%rsi)
; AVX-NEXT:    vmovlps %xmm4, (%rdx)
; AVX-NEXT:    vmovlps %xmm5, (%rcx)
; AVX-NEXT:    vmovlps %xmm0, (%r8)
; AVX-NEXT:    vmovlps %xmm6, (%r9)
; AVX-NEXT:    vmovlps %xmm1, (%rax)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride6_vf2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX2-NEXT:    vmovaps (%rdi), %xmm1
; AVX2-NEXT:    vmovaps 16(%rdi), %xmm2
; AVX2-NEXT:    vmovaps 32(%rdi), %xmm3
; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-NEXT:    vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-NEXT:    vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vmovlps %xmm4, (%rsi)
; AVX2-NEXT:    vmovlps %xmm2, (%rdx)
; AVX2-NEXT:    vmovlps %xmm5, (%rcx)
; AVX2-NEXT:    vmovlps %xmm1, (%r8)
; AVX2-NEXT:    vmovlps %xmm3, (%r9)
; AVX2-NEXT:    vmovlps %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf2:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm1
; AVX2-FP-NEXT:    vmovaps 16(%rdi), %xmm2
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm3
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm6, %ymm0
; AVX2-FP-NEXT:    vmovlps %xmm4, (%rsi)
; AVX2-FP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX2-FP-NEXT:    vmovlps %xmm5, (%rcx)
; AVX2-FP-NEXT:    vmovlps %xmm1, (%r8)
; AVX2-FP-NEXT:    vmovlps %xmm3, (%r9)
; AVX2-FP-NEXT:    vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf2:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm1
; AVX2-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm3
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm6, %ymm0
; AVX2-FCP-NEXT:    vmovlps %xmm4, (%rsi)
; AVX2-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX2-FCP-NEXT:    vmovlps %xmm5, (%rcx)
; AVX2-FCP-NEXT:    vmovlps %xmm1, (%r8)
; AVX2-FCP-NEXT:    vmovlps %xmm3, (%r9)
; AVX2-FCP-NEXT:    vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride6_vf2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512-NEXT:    vmovd %xmm2, %r10d
; AVX512-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512-NEXT:    vmovq %xmm3, (%rsi)
; AVX512-NEXT:    vmovq %xmm1, (%rdx)
; AVX512-NEXT:    vmovq %xmm4, (%rcx)
; AVX512-NEXT:    vmovq %xmm0, (%r8)
; AVX512-NEXT:    vmovlps %xmm2, (%r9)
; AVX512-NEXT:    vmovlps %xmm5, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf2:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512DQ-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512DQ-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512DQ-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQ-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512DQ-NEXT:    vmovd %xmm2, %r10d
; AVX512DQ-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512DQ-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
; AVX512DQ-NEXT:    vmovq %xmm1, (%rdx)
; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT:    vmovq %xmm0, (%r8)
; AVX512DQ-NEXT:    vmovlps %xmm2, (%r9)
; AVX512DQ-NEXT:    vmovlps %xmm5, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf2:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512DQ-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512DQ-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512DQ-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512BW-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512BW-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-NEXT:    vmovd %xmm2, %r10d
; AVX512BW-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512BW-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512BW-NEXT:    vmovq %xmm3, (%rsi)
; AVX512BW-NEXT:    vmovq %xmm1, (%rdx)
; AVX512BW-NEXT:    vmovq %xmm4, (%rcx)
; AVX512BW-NEXT:    vmovq %xmm0, (%r8)
; AVX512BW-NEXT:    vmovlps %xmm2, (%r9)
; AVX512BW-NEXT:    vmovlps %xmm5, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf2:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512BW-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512BW-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512BW-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512BW-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf2:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512DQ-BW-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512DQ-BW-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQ-BW-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512DQ-BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512DQ-BW-NEXT:    vmovd %xmm2, %r10d
; AVX512DQ-BW-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512DQ-BW-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rsi)
; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%rdx)
; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rcx)
; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r8)
; AVX512DQ-BW-NEXT:    vmovlps %xmm2, (%r9)
; AVX512DQ-BW-NEXT:    vmovlps %xmm5, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf2:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512DQ-BW-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <12 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 0, i32 6>
  %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 1, i32 7>
  %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 2, i32 8>
  %strided.vec3 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 3, i32 9>
  %strided.vec4 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 4, i32 10>
  %strided.vec5 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 5, i32 11>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf4:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movdqa 80(%rdi), %xmm1
; SSE-NEXT:    movdqa 64(%rdi), %xmm0
; SSE-NEXT:    movdqa (%rdi), %xmm6
; SSE-NEXT:    movdqa 16(%rdi), %xmm2
; SSE-NEXT:    movdqa 48(%rdi), %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
; SSE-NEXT:    movdqa 32(%rdi), %xmm10
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1]
; SSE-NEXT:    movapd %xmm4, (%rsi)
; SSE-NEXT:    movapd %xmm3, (%rdx)
; SSE-NEXT:    movapd %xmm5, (%rcx)
; SSE-NEXT:    movapd %xmm6, (%r8)
; SSE-NEXT:    movapd %xmm9, (%r9)
; SSE-NEXT:    movapd %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf4:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,3]
; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
; AVX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[2]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,3]
; AVX-NEXT:    vmovaps 80(%rdi), %xmm6
; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[0]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[3,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1]
; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[2,2,3,3]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm8
; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
; AVX-NEXT:    vmovaps %xmm4, (%rsi)
; AVX-NEXT:    vmovaps %xmm2, (%rdx)
; AVX-NEXT:    vmovaps %xmm3, (%rcx)
; AVX-NEXT:    vmovaps %xmm0, (%r8)
; AVX-NEXT:    vmovaps %xmm7, (%r9)
; AVX-NEXT:    vmovaps %xmm1, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride6_vf4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
; AVX2-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm5
; AVX2-NEXT:    vpbroadcastd %xmm5, %xmm6
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-NEXT:    vpbroadcastd 84(%rdi), %xmm8
; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-NEXT:    vmovdqa %xmm6, (%rcx)
; AVX2-NEXT:    vmovdqa %xmm7, (%r8)
; AVX2-NEXT:    vmovdqa %xmm2, (%r9)
; AVX2-NEXT:    vmovdqa %xmm1, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf4:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
; AVX2-FP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm5
; AVX2-FP-NEXT:    vpbroadcastd %xmm5, %xmm6
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-FP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm2
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3]
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rcx)
; AVX2-FP-NEXT:    vmovdqa %xmm7, (%r8)
; AVX2-FP-NEXT:    vmovdqa %xmm2, (%r9)
; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf4:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [2,0,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm5, %ymm5
; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm7
; AVX2-FCP-NEXT:    vpbroadcastd %xmm7, %xmm8
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,1,7,7]
; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm8, %ymm6
; AVX2-FCP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0]
; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm7, %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%r9)
; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride6_vf4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf4:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf4:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512BW-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BW-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512BW-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf4:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf4:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf4:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <24 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
  %strided.vec1 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>
  %strided.vec2 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20>
  %strided.vec3 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21>
  %strided.vec4 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22>
  %strided.vec5 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa 144(%rdi), %xmm4
; SSE-NEXT:    movdqa 160(%rdi), %xmm2
; SSE-NEXT:    movdqa 96(%rdi), %xmm6
; SSE-NEXT:    movdqa 112(%rdi), %xmm3
; SSE-NEXT:    movdqa 64(%rdi), %xmm5
; SSE-NEXT:    movdqa (%rdi), %xmm10
; SSE-NEXT:    movdqa 16(%rdi), %xmm11
; SSE-NEXT:    movdqa 48(%rdi), %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa %xmm10, %xmm7
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1]
; SSE-NEXT:    movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1]
; SSE-NEXT:    movdqa %xmm6, %xmm9
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1]
; SSE-NEXT:    movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa %xmm5, %xmm9
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3]
; SSE-NEXT:    movdqa %xmm8, %xmm11
; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    movdqa %xmm4, %xmm12
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
; SSE-NEXT:    movdqa 80(%rdi), %xmm14
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movdqa 32(%rdi), %xmm7
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1]
; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3]
; SSE-NEXT:    movdqa 176(%rdi), %xmm15
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa 128(%rdi), %xmm5
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1]
; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm3, %xmm10
; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    movapd %xmm12, 16(%rdx)
; SSE-NEXT:    movapd %xmm11, (%rdx)
; SSE-NEXT:    movapd %xmm13, 16(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rcx)
; SSE-NEXT:    movapd %xmm4, 16(%r8)
; SSE-NEXT:    movapd %xmm8, (%r8)
; SSE-NEXT:    movapd %xmm10, 16(%r9)
; SSE-NEXT:    movapd %xmm6, (%r9)
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movapd %xmm2, 16(%rax)
; SSE-NEXT:    movapd %xmm9, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf8:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovapd 160(%rdi), %ymm3
; AVX-NEXT:    vmovapd 128(%rdi), %ymm4
; AVX-NEXT:    vmovaps 32(%rdi), %ymm6
; AVX-NEXT:    vmovaps (%rdi), %ymm7
; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
; AVX-NEXT:    vmovaps 64(%rdi), %ymm1
; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm5
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm5[0,0],ymm0[6,4],ymm5[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6]
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm9
; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm11
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5]
; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4]
; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX-NEXT:    vmovapd 80(%rdi), %xmm12
; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[3]
; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4]
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT:    vmovaps %ymm2, (%rsi)
; AVX-NEXT:    vmovaps %ymm5, (%rdx)
; AVX-NEXT:    vmovaps %ymm6, (%rcx)
; AVX-NEXT:    vmovaps %ymm7, (%r8)
; AVX-NEXT:    vmovaps %ymm8, (%r9)
; AVX-NEXT:    vmovaps %ymm0, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride6_vf8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
; AVX2-NEXT:    vmovaps 160(%rdi), %ymm1
; AVX2-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm4
; AVX2-NEXT:    vmovaps 64(%rdi), %ymm6
; AVX2-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,4,u]
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT:    vpermps %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2]
; AVX2-NEXT:    vpermps %ymm10, %ymm8, %ymm11
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
; AVX2-NEXT:    vpermps %ymm7, %ymm11, %ymm7
; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm10
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm11[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,0,2,0,4,4,6,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,0,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,7,5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
; AVX2-NEXT:    vmovaps 80(%rdi), %xmm12
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm2, %ymm8, %ymm4
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
1115; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
1116; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
1117; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
1118; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
1119; AVX2-NEXT:    vpermps %ymm2, %ymm9, %ymm2
1120; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
1121; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
1122; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
1123; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm0
1124; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
1125; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
1126; AVX2-NEXT:    vmovaps %ymm7, (%rdx)
1127; AVX2-NEXT:    vmovaps %ymm10, (%rcx)
1128; AVX2-NEXT:    vmovaps %ymm11, (%r8)
1129; AVX2-NEXT:    vmovaps %ymm1, (%r9)
1130; AVX2-NEXT:    vmovaps %ymm0, (%rax)
1131; AVX2-NEXT:    vzeroupper
1132; AVX2-NEXT:    retq
1133;
1134; AVX2-FP-LABEL: load_i32_stride6_vf8:
1135; AVX2-FP:       # %bb.0:
1136; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1137; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
1138; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm1
1139; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm5
1140; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm2
1141; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm4
1142; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm6
1143; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,4,u]
1144; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
1145; AVX2-FP-NEXT:    vpermps %ymm7, %ymm3, %ymm3
1146; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1]
1147; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1148; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6]
1149; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
1150; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1151; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2]
1152; AVX2-FP-NEXT:    vpermps %ymm10, %ymm8, %ymm11
1153; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
1154; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
1155; AVX2-FP-NEXT:    vpermps %ymm7, %ymm11, %ymm7
1156; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
1157; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
1158; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
1159; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm10
1160; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
1161; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
1162; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
1163; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
1164; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
1165; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm11[2,0,2,3,6,4,6,7]
1166; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
1167; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
1168; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
1169; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,0,2,0,4,4,6,4]
1170; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
1171; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
1172; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[3,3,3,3,7,7,7,7]
1173; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
1174; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,0,3]
1175; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1,3,3,7,5,7,7]
1176; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
1177; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
1178; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,7,5]
1179; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
1180; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
1181; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm12
1182; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1183; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
1184; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6]
1185; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
1186; AVX2-FP-NEXT:    vpermps %ymm2, %ymm8, %ymm4
1187; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7]
1188; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
1189; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6]
1190; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1]
1191; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm1
1192; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
1193; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
1194; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
1195; AVX2-FP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
1196; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
1197; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
1198; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
1199; AVX2-FP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
1200; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
1201; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
1202; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
1203; AVX2-FP-NEXT:    vmovaps %ymm10, (%rcx)
1204; AVX2-FP-NEXT:    vmovaps %ymm11, (%r8)
1205; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
1206; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
1207; AVX2-FP-NEXT:    vzeroupper
1208; AVX2-FP-NEXT:    retq
1209;
1210; AVX2-FCP-LABEL: load_i32_stride6_vf8:
1211; AVX2-FCP:       # %bb.0:
1212; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1213; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
1214; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm1
1215; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm5
1216; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm2
1217; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm4
1218; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm6
1219; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,4,u]
1220; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
1221; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm3, %ymm3
1222; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1]
1223; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1224; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6]
1225; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
1226; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1227; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2]
1228; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm8, %ymm11
1229; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
1230; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
1231; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm11, %ymm7
1232; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
1233; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
1234; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
1235; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm9, %ymm10
1236; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
1237; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
1238; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [2,4,2,4,2,4,2,4]
1239; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
1240; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm11 = [2,0,6,7]
1241; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
1242; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm11, %ymm11
1243; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
1244; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
1245; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [0,0,6,4,0,0,6,4]
1246; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
1247; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm13
1248; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
1249; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [0,1,7,5,0,1,7,5]
1250; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
1251; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm11
1252; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm13 = mem[3,3,3,3]
1253; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
1254; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7]
1255; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm13 = [3,1,7,5,0,u,u,u]
1256; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm13, %ymm12
1257; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7]
1258; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm12
1259; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1260; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
1261; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6]
1262; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
1263; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm8, %ymm4
1264; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7]
1265; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
1266; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6]
1267; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
1268; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm1
1269; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
1270; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
1271; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
1272; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
1273; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
1274; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
1275; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
1276; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
1277; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
1278; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
1279; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rdx)
1280; AVX2-FCP-NEXT:    vmovaps %ymm10, (%rcx)
1281; AVX2-FCP-NEXT:    vmovaps %ymm11, (%r8)
1282; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
1283; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
1284; AVX2-FCP-NEXT:    vzeroupper
1285; AVX2-FCP-NEXT:    retq
1286;
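; Note: with the fast-variable-shuffle features enabled, the FCP variants
; above replace some of the plain AVX2 vshufps+vpermpd pairs with a single
; vpermps through a loaded index vector (e.g. [2,0,6,7] or [3,1,7,5,0,u,u,u]).
;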
; AVX512-LABEL: load_i32_stride6_vf8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf8:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512DQ-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512DQ-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512DQ-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512DQ-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512DQ-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512DQ-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512DQ-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512BW-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512BW-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512BW-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512BW-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512BW-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512BW-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512BW-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf8:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512BW-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512BW-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf8:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512DQ-BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512DQ-BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512DQ-BW-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512DQ-BW-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512DQ-BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512DQ-BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
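; Source pattern for vf8: one <48 x i32> wide load deinterleaved into six
; <8 x i32> results, where lane j of %strided.vecN takes wide-vector
; element 6*j + N.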
  %wide.vec = load <48 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
  %strided.vec1 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
  %strided.vec2 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
  %strided.vec3 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
  %strided.vec4 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
  %strided.vec5 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <8 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <8 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

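; vf16 is the same deinterleave at twice the width: a <96 x i32> load split
; into six <16 x i32> results. The SSE lowering below runs out of registers
; and keeps a 408-byte spill frame.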
define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf16:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $408, %rsp # imm = 0x198
; SSE-NEXT:    movdqa 240(%rdi), %xmm9
; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 256(%rdi), %xmm3
; SSE-NEXT:    movdqa 192(%rdi), %xmm10
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 208(%rdi), %xmm4
; SSE-NEXT:    movdqa 336(%rdi), %xmm14
; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 352(%rdi), %xmm5
; SSE-NEXT:    movdqa 288(%rdi), %xmm15
; SSE-NEXT:    movdqa 304(%rdi), %xmm7
; SSE-NEXT:    movdqa 64(%rdi), %xmm12
; SSE-NEXT:    movdqa (%rdi), %xmm8
; SSE-NEXT:    movdqa 16(%rdi), %xmm11
; SSE-NEXT:    movdqa 48(%rdi), %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm8, %xmm1
; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm15, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE-NEXT:    movdqa %xmm4, %xmm14
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm10, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 96(%rdi), %xmm4
; SSE-NEXT:    movdqa 112(%rdi), %xmm10
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 144(%rdi), %xmm9
; SSE-NEXT:    movdqa 160(%rdi), %xmm6
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
; SSE-NEXT:    movdqa %xmm13, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm5, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa %xmm9, %xmm14
; SSE-NEXT:    movdqa %xmm9, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3]
; SSE-NEXT:    movdqa 80(%rdi), %xmm10
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
; SSE-NEXT:    movdqa 32(%rdi), %xmm6
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
; SSE-NEXT:    movdqa 368(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm15, %xmm9
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3]
; SSE-NEXT:    movdqa 320(%rdi), %xmm8
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
; SSE-NEXT:    movdqa 272(%rdi), %xmm15
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa 224(%rdi), %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3]
; SSE-NEXT:    movdqa 176(%rdi), %xmm11
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa 128(%rdi), %xmm4
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE-NEXT:    movdqa %xmm3, %xmm9
; SSE-NEXT:    movdqa %xmm3, (%rsp) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT:    # xmm12 = mem[2,3,2,3]
; SSE-NEXT:    movdqa %xmm15, %xmm3
; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
; SSE-NEXT:    movdqa %xmm4, %xmm7
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm6, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm14, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm7, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rcx)
; SSE-NEXT:    movapd %xmm15, 16(%r8)
; SSE-NEXT:    movapd %xmm12, 32(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%r8)
; SSE-NEXT:    movapd %xmm2, 16(%r9)
; SSE-NEXT:    movapd %xmm3, 32(%r9)
; SSE-NEXT:    movapd %xmm4, 48(%r9)
; SSE-NEXT:    movapd %xmm5, (%r9)
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movapd %xmm13, 16(%rax)
; SSE-NEXT:    movapd %xmm9, 32(%rax)
; SSE-NEXT:    movapd %xmm8, 48(%rax)
; SSE-NEXT:    movapd %xmm10, (%rax)
; SSE-NEXT:    addq $408, %rsp # imm = 0x198
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $328, %rsp # imm = 0x148
; AVX-NEXT:    vmovaps 224(%rdi), %ymm12
; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 288(%rdi), %ymm10
; AVX-NEXT:    vmovaps 256(%rdi), %ymm4
; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 160(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 128(%rdi), %ymm13
; AVX-NEXT:    vmovaps 32(%rdi), %ymm6
; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rdi), %ymm14
; AVX-NEXT:    vmovaps 96(%rdi), %ymm9
; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm0, %ymm2
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm7
; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm6[0,1],xmm7[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1]
; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[1],ymm8[3],ymm13[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm4, %ymm1
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm1[2,2],ymm5[6,4],ymm1[6,6]
; AVX-NEXT:    vmovaps 192(%rdi), %ymm15
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm3[0,1],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT:    vmovapd 352(%rdi), %ymm4
; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 320(%rdi), %ymm12
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1]
; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[1],ymm4[3],ymm12[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7]
; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm2[2,3],ymm11[6,4],ymm2[6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm7[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[1,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[1,0],xmm0[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[3,1],ymm12[1,3],ymm4[7,5],ymm12[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
; AVX-NEXT:    # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm7[2,0],ymm9[6,5],ymm7[6,4]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm5[2,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[0,0],ymm4[2,0],ymm8[4,4],ymm4[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
; AVX-NEXT:    # ymm11 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[2,1],ymm3[2,0],ymm10[6,5],ymm3[6,4]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm15
; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm11[2,0],xmm15[2,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5]
2050; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3]
2051; AVX-NEXT:    vmovaps %ymm7, %ymm1
2052; AVX-NEXT:    vmovups (%rsp), %ymm7 # 32-byte Reload
2053; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5]
2054; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
2055; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
2056; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
2057; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2058; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5]
2059; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[3,1],xmm15[3,3]
2060; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2061; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
2062; AVX-NEXT:    vmovaps %ymm3, %ymm15
2063; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
2064; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
2065; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
2066; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2067; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7]
2068; AVX-NEXT:    vmovaps 32(%rdi), %xmm3
2069; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3]
2070; AVX-NEXT:    vmovaps 16(%rdi), %xmm5
2071; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
2072; AVX-NEXT:    vmovapd 80(%rdi), %xmm6
2073; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[3]
2074; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4]
2075; AVX-NEXT:    vmovaps %ymm7, %ymm13
2076; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7]
2077; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1]
2078; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4]
2079; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4]
2080; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7]
2081; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
2082; AVX-NEXT:    vmovaps 224(%rdi), %xmm12
2083; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3]
2084; AVX-NEXT:    vmovaps 208(%rdi), %xmm0
2085; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3]
2086; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
2087; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
2088; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4]
2089; AVX-NEXT:    vmovaps %ymm4, %ymm10
2090; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7]
2091; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,0,1]
2092; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[2,0],ymm14[0,0],ymm11[6,4],ymm14[4,4]
2093; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4]
2094; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
2095; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4]
2096; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm8[0,3],ymm7[2,0],ymm8[4,7],ymm7[6,4]
2097; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2098; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
2099; AVX-NEXT:    # ymm5 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7]
2100; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4]
2101; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
2102; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7]
2103; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7]
2104; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm14[1,0],ymm11[7,4],ymm14[5,4]
2105; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,3],ymm5[2,0],ymm14[4,7],ymm5[6,4]
2106; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
2107; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7]
2108; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[2,0],ymm10[5,5],ymm1[6,4]
2109; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
2110; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
2111; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
2112; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2113; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
2114; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2115; AVX-NEXT:    vmovaps %ymm1, (%rsi)
2116; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2117; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
2118; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2119; AVX-NEXT:    vmovaps %ymm1, (%rdx)
2120; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2121; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
2122; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2123; AVX-NEXT:    vmovaps %ymm1, (%rcx)
2124; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2125; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
2126; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2127; AVX-NEXT:    vmovaps %ymm1, (%r8)
2128; AVX-NEXT:    vmovaps %ymm4, 32(%r9)
2129; AVX-NEXT:    vmovaps %ymm2, (%r9)
2130; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2131; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
2132; AVX-NEXT:    vmovaps %ymm3, (%rax)
2133; AVX-NEXT:    addq $328, %rsp # imm = 0x148
2134; AVX-NEXT:    vzeroupper
2135; AVX-NEXT:    retq
2136;
2137; AVX2-LABEL: load_i32_stride6_vf16:
2138; AVX2:       # %bb.0:
2139; AVX2-NEXT:    subq $392, %rsp # imm = 0x188
2140; AVX2-NEXT:    vmovaps 288(%rdi), %ymm10
2141; AVX2-NEXT:    vmovaps 224(%rdi), %ymm1
2142; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2143; AVX2-NEXT:    vmovaps 192(%rdi), %ymm2
2144; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2145; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
2146; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
2147; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2148; AVX2-NEXT:    vmovaps 96(%rdi), %ymm15
2149; AVX2-NEXT:    vmovaps (%rdi), %ymm4
2150; AVX2-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
2151; AVX2-NEXT:    vmovaps 32(%rdi), %ymm5
2152; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2153; AVX2-NEXT:    vmovaps 64(%rdi), %ymm13
2154; AVX2-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
2155; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
2156; AVX2-NEXT:    vpermps %ymm8, %ymm6, %ymm7
2157; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1]
2158; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
2159; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
2160; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
2161; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2162; AVX2-NEXT:    vmovaps %ymm0, %ymm7
2163; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2164; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
2165; AVX2-NEXT:    vpermps %ymm4, %ymm12, %ymm14
2166; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2167; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
2168; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2169; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
2170; AVX2-NEXT:    vpermps %ymm3, %ymm6, %ymm0
2171; AVX2-NEXT:    vmovaps 256(%rdi), %ymm11
2172; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2173; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
2174; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7]
2175; AVX2-NEXT:    vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6]
2176; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7]
2177; AVX2-NEXT:    vmovaps 320(%rdi), %ymm5
2178; AVX2-NEXT:    vmovaps 352(%rdi), %ymm6
2179; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2180; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2181; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2182; AVX2-NEXT:    vpermps %ymm0, %ymm12, %ymm10
2183; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
2184; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2185; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,7,5,u]
2186; AVX2-NEXT:    vpermps %ymm8, %ymm2, %ymm8
2187; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
2188; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
2189; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
2190; AVX2-NEXT:    vpermps %ymm4, %ymm14, %ymm4
2191; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
2192; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2193; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm2
2194; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
2195; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
2196; AVX2-NEXT:    vpermps %ymm0, %ymm14, %ymm0
2197; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2198; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2199; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
2200; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
2201; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
2202; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2203; AVX2-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
2204; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
2205; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7]
2206; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
2207; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
2208; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
2209; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
2210; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4]
2211; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
2212; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
2213; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2214; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2215; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
2216; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
2217; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
2218; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2219; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
2220; AVX2-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7]
2221; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7]
2222; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
2223; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
2224; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
2225; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4]
2226; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
2227; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
2228; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2229; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7]
2230; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
2231; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
2232; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7]
2233; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
2234; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2235; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5]
2236; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
2237; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2238; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
2239; AVX2-NEXT:    vmovaps %ymm7, %ymm6
2240; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
2241; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
2242; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7]
2243; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
2244; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
2245; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5]
2246; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
2247; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2248; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7]
2249; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2250; AVX2-NEXT:    vmovaps 80(%rdi), %xmm4
2251; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
2252; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
2253; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2254; AVX2-NEXT:    vpermps %ymm3, %ymm7, %ymm5
2255; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
2256; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
2257; AVX2-NEXT:    # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
2258; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
2259; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
2260; AVX2-NEXT:    vpermps %ymm5, %ymm10, %ymm13
2261; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
2262; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
2263; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2264; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
2265; AVX2-NEXT:    # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
2266; AVX2-NEXT:    vmovaps 272(%rdi), %xmm13
2267; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
2268; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
2269; AVX2-NEXT:    vpermps %ymm11, %ymm7, %ymm7
2270; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
2271; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2272; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
2273; AVX2-NEXT:    # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
2274; AVX2-NEXT:    vpermps %ymm12, %ymm10, %ymm10
2275; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
2276; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
2277; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
2278; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm3
2279; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
2280; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
2281; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
2282; AVX2-NEXT:    vpermps %ymm5, %ymm3, %ymm4
2283; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
2284; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
2285; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
2286; AVX2-NEXT:    vpermps %ymm11, %ymm14, %ymm5
2287; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
2288; AVX2-NEXT:    vpermps %ymm12, %ymm3, %ymm3
2289; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
2290; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2291; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
2292; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2293; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
2294; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2295; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
2296; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2297; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
2298; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2299; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
2300; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2301; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
2302; AVX2-NEXT:    vmovaps %ymm8, 32(%r8)
2303; AVX2-NEXT:    vmovaps %ymm0, (%r8)
2304; AVX2-NEXT:    vmovaps %ymm7, 32(%r9)
2305; AVX2-NEXT:    vmovaps %ymm1, (%r9)
2306; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2307; AVX2-NEXT:    vmovaps %ymm3, 32(%rax)
2308; AVX2-NEXT:    vmovaps %ymm2, (%rax)
2309; AVX2-NEXT:    addq $392, %rsp # imm = 0x188
2310; AVX2-NEXT:    vzeroupper
2311; AVX2-NEXT:    retq
2312;
2313; AVX2-FP-LABEL: load_i32_stride6_vf16:
2314; AVX2-FP:       # %bb.0:
2315; AVX2-FP-NEXT:    subq $392, %rsp # imm = 0x188
2316; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm10
2317; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm1
2318; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2319; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm2
2320; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2321; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
2322; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
2323; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2324; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm15
2325; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm4
2326; AVX2-FP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
2327; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm5
2328; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2329; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm13
2330; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
2331; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
2332; AVX2-FP-NEXT:    vpermps %ymm8, %ymm6, %ymm7
2333; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1]
2334; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
2335; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
2336; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
2337; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2338; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm7
2339; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2340; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
2341; AVX2-FP-NEXT:    vpermps %ymm4, %ymm12, %ymm14
2342; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2343; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
2344; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2345; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
2346; AVX2-FP-NEXT:    vpermps %ymm3, %ymm6, %ymm0
2347; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm11
2348; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2349; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
2350; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7]
2351; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6]
2352; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7]
2353; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm5
2354; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm6
2355; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2356; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2357; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2358; AVX2-FP-NEXT:    vpermps %ymm0, %ymm12, %ymm10
2359; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
2360; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2361; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,7,5,u]
2362; AVX2-FP-NEXT:    vpermps %ymm8, %ymm2, %ymm8
2363; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
2364; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
2365; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
2366; AVX2-FP-NEXT:    vpermps %ymm4, %ymm14, %ymm4
2367; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
2368; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2369; AVX2-FP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
2370; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
2371; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
2372; AVX2-FP-NEXT:    vpermps %ymm0, %ymm14, %ymm0
2373; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2374; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2375; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
2376; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
2377; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
2378; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2379; AVX2-FP-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
2380; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
2381; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7]
2382; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
2383; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
2384; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
2385; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
2386; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4]
2387; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
2388; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
2389; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2390; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2391; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
2392; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
2393; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
2394; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2395; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
2396; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7]
2397; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7]
2398; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
2399; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
2400; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
2401; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4]
2402; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
2403; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
2404; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2405; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7]
2406; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
2407; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
2408; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7]
2409; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
2410; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2411; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5]
2412; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
2413; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2414; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
2415; AVX2-FP-NEXT:    vmovaps %ymm7, %ymm6
2416; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
2417; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
2418; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7]
2419; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
2420; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
2421; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5]
2422; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
2423; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2424; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7]
2425; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2426; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm4
2427; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
2428; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
2429; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2430; AVX2-FP-NEXT:    vpermps %ymm3, %ymm7, %ymm5
2431; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
2432; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
2433; AVX2-FP-NEXT:    # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
2434; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
2435; AVX2-FP-NEXT:    # ymm10 = mem[0,1,0,1]
2436; AVX2-FP-NEXT:    vpermps %ymm5, %ymm10, %ymm13
2437; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
2438; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
2439; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2440; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
2441; AVX2-FP-NEXT:    # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
2442; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm13
2443; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
2444; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
2445; AVX2-FP-NEXT:    vpermps %ymm11, %ymm7, %ymm7
2446; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
2447; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2448; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
2449; AVX2-FP-NEXT:    # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
2450; AVX2-FP-NEXT:    vpermps %ymm12, %ymm10, %ymm10
2451; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
2452; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
2453; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
2454; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm3
2455; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
2456; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
2457; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
2458; AVX2-FP-NEXT:    vpermps %ymm5, %ymm3, %ymm4
2459; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
2460; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
2461; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
2462; AVX2-FP-NEXT:    vpermps %ymm11, %ymm14, %ymm5
2463; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
2464; AVX2-FP-NEXT:    vpermps %ymm12, %ymm3, %ymm3
2465; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
2466; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2467; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
2468; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2469; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
2470; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2471; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
2472; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2473; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
2474; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2475; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
2476; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2477; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
2478; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%r8)
2479; AVX2-FP-NEXT:    vmovaps %ymm0, (%r8)
2480; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%r9)
2481; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
2482; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2483; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rax)
2484; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
2485; AVX2-FP-NEXT:    addq $392, %rsp # imm = 0x188
2486; AVX2-FP-NEXT:    vzeroupper
2487; AVX2-FP-NEXT:    retq
2488;
2489; AVX2-FCP-LABEL: load_i32_stride6_vf16:
2490; AVX2-FCP:       # %bb.0:
2491; AVX2-FCP-NEXT:    subq $360, %rsp # imm = 0x168
2492; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm10
2493; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm6
2494; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2495; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm2
2496; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2497; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
2498; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2499; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
2500; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
2501; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm4
2502; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
2503; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm5
2504; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2505; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm15
2506; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
2507; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
2508; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm7
2509; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm5
2510; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm1[0,1]
2511; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
2512; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2513; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
2514; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
2515; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm0[4,5,6,7]
2516; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2]
2517; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm1, %ymm14
2518; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2519; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
2520; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2521; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
2522; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm12
2523; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm11
2524; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2525; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
2526; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm10[6,7]
2527; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm6[0,2,2,2,4,6,6,6]
2528; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm14[3,4,5,6,7]
2529; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm7
2530; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm12
2531; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm7[4,5,6,7]
2532; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2533; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm10
2534; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
2535; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2536; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,7,5,u]
2537; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm2, %ymm8
2538; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
2539; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
2540; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
2541; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm14, %ymm4
2542; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
2543; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2544; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
2545; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,3,2,3,5,7,6,7]
2546; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
2547; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm14, %ymm0
2548; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2549; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2550; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7]
2551; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,0,6,4,2,0,6,7]
2552; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2553; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm2 = [2,0,6,7]
2554; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
2555; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
2556; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7]
2557; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm4
2558; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
2559; AVX2-FCP-NEXT:    vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
2560; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm13[4,5],mem[6,7]
2561; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [0,0,6,4,0,0,6,4]
2562; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
2563; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm8, %ymm10
2564; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7]
2565; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2566; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2567; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
2568; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2569; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2570; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2571; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
2572; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
2573; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
2574; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
2575; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm8, %ymm8
2576; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
2577; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2578; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
2579; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7]
2580; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5],ymm3[6,7]
2581; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,1,7,5,0,1,7,5]
2582; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
2583; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm3, %ymm4
2584; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm8 = [3,1,7,5,0,u,u,u]
2585; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm8, %ymm0
2586; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
2587; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm4 = mem[3,3,3,3]
2588; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
2589; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5],ymm1[6,7]
2590; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
2591; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
2592; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2593; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
2594; AVX2-FCP-NEXT:    # ymm2 = ymm15[0,1,2,3],mem[4,5,6,7]
2595; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm9[4,5,6,7]
2596; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm4
2597; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
2598; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
2599; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2600; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm7, %ymm5
2601; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
2602; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
2603; AVX2-FCP-NEXT:    # ymm5 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7]
2604; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
2605; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
2606; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm10, %ymm13
2607; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
2608; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
2609; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2610; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
2611; AVX2-FCP-NEXT:    # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
2612; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm13
2613; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
2614; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
2615; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm7, %ymm7
2616; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
2617; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
2618; AVX2-FCP-NEXT:    # ymm12 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
2619; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm10, %ymm10
2620; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
2621; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
2622; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
2623; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm14, %ymm3
2624; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
2625; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
2626; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
2627; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm3, %ymm4
2628; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
2629; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
2630; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
2631; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm14, %ymm5
2632; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
2633; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm3, %ymm3
2634; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
2635; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2636; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
2637; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2638; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
2639; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2640; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
2641; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2642; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
2643; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2644; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
2645; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2646; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
2647; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%r8)
2648; AVX2-FCP-NEXT:    vmovaps %ymm0, (%r8)
2649; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%r9)
2650; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
2651; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2652; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rax)
2653; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
2654; AVX2-FCP-NEXT:    addq $360, %rsp # imm = 0x168
2655; AVX2-FCP-NEXT:    vzeroupper
2656; AVX2-FCP-NEXT:    retq
2657;
2658; AVX512-LABEL: load_i32_stride6_vf16:
2659; AVX512:       # %bb.0:
2660; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2661; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm0
2662; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm1
2663; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm3
2664; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm4
2665; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm5
2666; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm6
2667; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
2668; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
2669; AVX512-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
2670; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
2671; AVX512-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
2672; AVX512-NEXT:    movb $56, %dil
2673; AVX512-NEXT:    kmovw %edi, %k2
2674; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
2675; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2676; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
2677; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
2678; AVX512-NEXT:    movw $-2048, %di # imm = 0xF800
2679; AVX512-NEXT:    kmovw %edi, %k1
2680; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
2681; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
2682; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2683; AVX512-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
2684; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
2685; AVX512-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
2686; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
2687; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2688; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2689; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
2690; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
2691; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
2692; AVX512-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
2693; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
2694; AVX512-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
2695; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
2696; AVX512-NEXT:    movw $31, %di
2697; AVX512-NEXT:    kmovw %edi, %k2
2698; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
2699; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2700; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2701; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
2702; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
2703; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
2704; AVX512-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
2705; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
2706; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
2707; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
2708; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
2709; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
2710; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2711; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
2712; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
2713; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
2714; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2715; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
2716; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
2717; AVX512-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
2718; AVX512-NEXT:    movw $992, %di # imm = 0x3E0
2719; AVX512-NEXT:    kmovw %edi, %k1
2720; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
2721; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
2722; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2723; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
2724; AVX512-NEXT:    movb $-32, %dil
2725; AVX512-NEXT:    kmovw %edi, %k2
2726; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
2727; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
2728; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2729; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
2730; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
2731; AVX512-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
2732; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
2733; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
2734; AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2735; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
2736; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
2737; AVX512-NEXT:    vmovdqa64 %zmm2, (%rsi)
2738; AVX512-NEXT:    vmovdqa64 %zmm7, (%rdx)
2739; AVX512-NEXT:    vmovdqa64 %zmm9, (%rcx)
2740; AVX512-NEXT:    vmovdqa64 %zmm10, (%r8)
2741; AVX512-NEXT:    vmovdqa64 %zmm11, (%r9)
2742; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
2743; AVX512-NEXT:    vzeroupper
2744; AVX512-NEXT:    retq
2745;
2746; AVX512-FCP-LABEL: load_i32_stride6_vf16:
2747; AVX512-FCP:       # %bb.0:
2748; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2749; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
2750; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
2751; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
2752; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
2753; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
2754; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm6
2755; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
2756; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
2757; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
2758; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
2759; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
2760; AVX512-FCP-NEXT:    movb $56, %dil
2761; AVX512-FCP-NEXT:    kmovw %edi, %k2
2762; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
2763; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2764; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
2765; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
2766; AVX512-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
2767; AVX512-FCP-NEXT:    kmovw %edi, %k1
2768; AVX512-FCP-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
2769; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
2770; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2771; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
2772; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
2773; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
2774; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
2775; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2776; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2777; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
2778; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
2779; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
2780; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
2781; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
2782; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
2783; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
2784; AVX512-FCP-NEXT:    movw $31, %di
2785; AVX512-FCP-NEXT:    kmovw %edi, %k2
2786; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
2787; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2788; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2789; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
2790; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
2791; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
2792; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
2793; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
2794; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
2795; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
2796; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
2797; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
2798; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2799; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
2800; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
2801; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
2802; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2803; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
2804; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
2805; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
2806; AVX512-FCP-NEXT:    movw $992, %di # imm = 0x3E0
2807; AVX512-FCP-NEXT:    kmovw %edi, %k1
2808; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
2809; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
2810; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2811; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
2812; AVX512-FCP-NEXT:    movb $-32, %dil
2813; AVX512-FCP-NEXT:    kmovw %edi, %k2
2814; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
2815; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
2816; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
2817; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
2818; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
2819; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
2820; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
2821; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
2822; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2823; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
2824; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
2825; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rsi)
2826; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
2827; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%rcx)
2828; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, (%r8)
2829; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%r9)
2830; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
2831; AVX512-FCP-NEXT:    vzeroupper
2832; AVX512-FCP-NEXT:    retq
2833;
; AVX512DQ-LABEL: load_i32_stride6_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-NEXT:    movb $56, %dil
; AVX512DQ-NEXT:    kmovw %edi, %k2
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-NEXT:    kmovw %edi, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-NEXT:    movw $31, %di
; AVX512DQ-NEXT:    kmovw %edi, %k2
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-NEXT:    kmovw %edi, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-NEXT:    movb $-32, %dil
; AVX512DQ-NEXT:    kmovw %edi, %k2
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%r8)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%r9)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf16:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT:    movb $56, %dil
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-FCP-NEXT:    movw $31, %di
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-FCP-NEXT:    movb $-32, %dil
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%r8)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%r9)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm6
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
; AVX512BW-NEXT:    movb $56, %dil
; AVX512BW-NEXT:    kmovd %edi, %k2
; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
; AVX512BW-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
; AVX512BW-NEXT:    movw $31, %di
; AVX512BW-NEXT:    kmovd %edi, %k2
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
; AVX512BW-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-NEXT:    movb $-32, %dil
; AVX512BW-NEXT:    kmovd %edi, %k2
; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rsi)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rcx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%r8)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%r9)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf16:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm6
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
; AVX512BW-FCP-NEXT:    movb $56, %dil
; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
; AVX512BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
; AVX512BW-FCP-NEXT:    movw $31, %di
; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
; AVX512BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-FCP-NEXT:    movb $-32, %dil
; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r9)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf16:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-BW-NEXT:    movb $56, %dil
; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-BW-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-BW-NEXT:    movw $31, %di
; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-BW-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-NEXT:    movb $-32, %dil
; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, (%r9)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf16:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT:    movb $56, %dil
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-BW-FCP-NEXT:    movw $31, %di
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT:    movb $-32, %dil
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <96 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
  %strided.vec1 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91>
  %strided.vec2 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92>
  %strided.vec3 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93>
  %strided.vec4 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94>
  %strided.vec5 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95>
  store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <16 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <16 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <16 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf32:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $1032, %rsp # imm = 0x408
; SSE-NEXT:    movdqa 64(%rdi), %xmm5
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa (%rdi), %xmm12
; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 16(%rdi), %xmm13
; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 48(%rdi), %xmm9
; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 528(%rdi), %xmm7
; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 544(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 480(%rdi), %xmm8
; SSE-NEXT:    movdqa 496(%rdi), %xmm4
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 144(%rdi), %xmm10
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 160(%rdi), %xmm2
; SSE-NEXT:    movdqa 96(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 112(%rdi), %xmm11
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
; SSE-NEXT:    movdqa %xmm2, %xmm6
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE-NEXT:    movdqa %xmm8, %xmm1
; SSE-NEXT:    movdqa %xmm8, %xmm4
; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
; SSE-NEXT:    movdqa %xmm12, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 384(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 400(%rdi), %xmm14
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3]
; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 432(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, (%rsp) # 16-byte Spill
; SSE-NEXT:    movdqa 448(%rdi), %xmm9
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 288(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 304(%rdi), %xmm15
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3]
; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 336(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 352(%rdi), %xmm12
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 672(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 688(%rdi), %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 720(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 736(%rdi), %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 192(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 208(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 240(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 256(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 576(%rdi), %xmm7
; SSE-NEXT:    movdqa 592(%rdi), %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm7, %xmm1
; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 624(%rdi), %xmm10
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 640(%rdi), %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm10, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm11, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm4, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3]
; SSE-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm15, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm8, %xmm2
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm14, %xmm5
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm5, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3]
; SSE-NEXT:    movdqa 176(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa 128(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3]
; SSE-NEXT:    movdqa 80(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa 32(%rdi), %xmm13
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3]
; SSE-NEXT:    movdqa 368(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
; SSE-NEXT:    movdqa 320(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3]
; SSE-NEXT:    movdqa 272(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
; SSE-NEXT:    movdqa %xmm12, %xmm14
; SSE-NEXT:    movdqa 224(%rdi), %xmm6
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3]
; SSE-NEXT:    movdqa 560(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 512(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa (%rsp), %xmm11 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3]
; SSE-NEXT:    movdqa 464(%rdi), %xmm15
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa 416(%rdi), %xmm4
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3]
; SSE-NEXT:    movdqa 752(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 704(%rdi), %xmm12
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
; SSE-NEXT:    movdqa 656(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    movdqa 608(%rdi), %xmm5
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    movdqa %xmm13, %xmm9
; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm12, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT:    # xmm9 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
; SSE-NEXT:    movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm13, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm14[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT:    # xmm8 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm10, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT:    # xmm6 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, %xmm11
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3855; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
3856; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3857; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
3858; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3859; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3860; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
3861; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
3862; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
3863; SSE-NEXT:    movapd %xmm14, %xmm12
3864; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3865; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
3866; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3867; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
3868; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3869; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3870; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
3871; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3872; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
3873; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
3874; SSE-NEXT:    movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3875; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
3876; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3877; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
3878; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3879; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3880; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
3881; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3882; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
3883; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
3884; SSE-NEXT:    movapd %xmm15, %xmm10
3885; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3886; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
3887; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3888; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
3889; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3890; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3891; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
3892; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3893; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
3894; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
3895; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3896; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
3897; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3898; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
3899; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3900; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3901; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
3902; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3903; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
3904; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
3905; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3906; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
3907; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
3908; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3909; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3910; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
3911; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3912; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
3913; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
3914; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3915; SSE-NEXT:    movaps %xmm0, 96(%rsi)
3916; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3917; SSE-NEXT:    movaps %xmm0, 32(%rsi)
3918; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3919; SSE-NEXT:    movaps %xmm0, 112(%rsi)
3920; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3921; SSE-NEXT:    movaps %xmm0, 48(%rsi)
3922; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3923; SSE-NEXT:    movaps %xmm0, 64(%rsi)
3924; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3925; SSE-NEXT:    movaps %xmm0, (%rsi)
3926; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3927; SSE-NEXT:    movaps %xmm0, 80(%rsi)
3928; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3929; SSE-NEXT:    movaps %xmm0, 16(%rsi)
3930; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3931; SSE-NEXT:    movaps %xmm0, 96(%rdx)
3932; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3933; SSE-NEXT:    movaps %xmm0, 32(%rdx)
3934; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3935; SSE-NEXT:    movaps %xmm0, 112(%rdx)
3936; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3937; SSE-NEXT:    movaps %xmm0, 48(%rdx)
3938; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3939; SSE-NEXT:    movaps %xmm0, 64(%rdx)
3940; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3941; SSE-NEXT:    movaps %xmm0, (%rdx)
3942; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3943; SSE-NEXT:    movaps %xmm0, 80(%rdx)
3944; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3945; SSE-NEXT:    movaps %xmm0, 16(%rdx)
3946; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3947; SSE-NEXT:    movaps %xmm0, 96(%rcx)
3948; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3949; SSE-NEXT:    movaps %xmm0, 112(%rcx)
3950; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3951; SSE-NEXT:    movaps %xmm0, 64(%rcx)
3952; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3953; SSE-NEXT:    movaps %xmm0, 80(%rcx)
3954; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3955; SSE-NEXT:    movaps %xmm0, 32(%rcx)
3956; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3957; SSE-NEXT:    movaps %xmm0, 48(%rcx)
3958; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3959; SSE-NEXT:    movaps %xmm0, (%rcx)
3960; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3961; SSE-NEXT:    movaps %xmm0, 16(%rcx)
3962; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3963; SSE-NEXT:    movaps %xmm0, 112(%r8)
3964; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3965; SSE-NEXT:    movaps %xmm0, 96(%r8)
3966; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
3967; SSE-NEXT:    movaps %xmm0, 80(%r8)
3968; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3969; SSE-NEXT:    movaps %xmm0, 64(%r8)
3970; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3971; SSE-NEXT:    movaps %xmm0, 48(%r8)
3972; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3973; SSE-NEXT:    movaps %xmm0, 32(%r8)
3974; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3975; SSE-NEXT:    movaps %xmm0, 16(%r8)
3976; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3977; SSE-NEXT:    movaps %xmm0, (%r8)
3978; SSE-NEXT:    movapd %xmm2, 112(%r9)
3979; SSE-NEXT:    movapd %xmm3, 96(%r9)
3980; SSE-NEXT:    movapd %xmm4, 80(%r9)
3981; SSE-NEXT:    movapd %xmm6, 64(%r9)
3982; SSE-NEXT:    movapd %xmm8, 48(%r9)
3983; SSE-NEXT:    movapd %xmm9, 32(%r9)
3984; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3985; SSE-NEXT:    movaps %xmm0, 16(%r9)
3986; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3987; SSE-NEXT:    movaps %xmm0, (%r9)
3988; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3989; SSE-NEXT:    movapd %xmm14, 112(%rax)
3990; SSE-NEXT:    movapd %xmm13, 96(%rax)
3991; SSE-NEXT:    movapd %xmm15, 80(%rax)
3992; SSE-NEXT:    movapd %xmm10, 64(%rax)
3993; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3994; SSE-NEXT:    movaps %xmm0, 48(%rax)
3995; SSE-NEXT:    movapd %xmm12, 32(%rax)
3996; SSE-NEXT:    movapd %xmm11, 16(%rax)
3997; SSE-NEXT:    movapd %xmm7, (%rax)
3998; SSE-NEXT:    addq $1032, %rsp # imm = 0x408
3999; SSE-NEXT:    retq
4000;
4001; AVX-LABEL: load_i32_stride6_vf32:
4002; AVX:       # %bb.0:
4003; AVX-NEXT:    subq $1032, %rsp # imm = 0x408
4004; AVX-NEXT:    vmovaps 416(%rdi), %ymm9
4005; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4006; AVX-NEXT:    vmovaps 480(%rdi), %ymm4
4007; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4008; AVX-NEXT:    vmovaps 448(%rdi), %ymm5
4009; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4010; AVX-NEXT:    vmovapd 160(%rdi), %ymm2
4011; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4012; AVX-NEXT:    vmovapd 128(%rdi), %ymm3
4013; AVX-NEXT:    vmovupd %ymm3, (%rsp) # 32-byte Spill
4014; AVX-NEXT:    vmovaps 32(%rdi), %ymm6
4015; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4016; AVX-NEXT:    vmovaps (%rdi), %ymm7
4017; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4018; AVX-NEXT:    vmovaps 96(%rdi), %ymm1
4019; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4020; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
4021; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4022; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm0, %ymm8
4023; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4]
4024; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6]
4025; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
4026; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm7
4027; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm7[2,3]
4028; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3]
4029; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4030; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1]
4031; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4032; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
4033; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4034; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4035; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4036; AVX-NEXT:    vinsertf128 $1, 480(%rdi), %ymm5, %ymm6
4037; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm6[0,0],ymm4[6,4],ymm6[4,4]
4038; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6]
4039; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
4040; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4041; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7]
4042; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm5
4043; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm5[2,3]
4044; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3]
4045; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4046; AVX-NEXT:    vmovapd 544(%rdi), %ymm1
4047; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4048; AVX-NEXT:    vmovapd 512(%rdi), %ymm2
4049; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4050; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
4051; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4052; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
4053; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4054; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4055; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4056; AVX-NEXT:    vmovaps 288(%rdi), %ymm1
4057; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4058; AVX-NEXT:    vmovaps 256(%rdi), %ymm0
4059; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4060; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm0, %ymm3
4061; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4]
4062; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6]
4063; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
4064; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4065; AVX-NEXT:    vmovaps 192(%rdi), %ymm2
4066; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4067; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
4068; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm2
4069; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm2[2,3]
4070; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3]
4071; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4072; AVX-NEXT:    vmovapd 352(%rdi), %ymm1
4073; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4074; AVX-NEXT:    vmovapd 320(%rdi), %ymm4
4075; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4076; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1]
4077; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4078; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
4079; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4080; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4081; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4082; AVX-NEXT:    vmovaps 672(%rdi), %ymm1
4083; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4084; AVX-NEXT:    vmovaps 640(%rdi), %ymm0
4085; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4086; AVX-NEXT:    vinsertf128 $1, 672(%rdi), %ymm0, %ymm9
4087; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4]
4088; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6]
4089; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
4090; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4091; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
4092; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4093; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
4094; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
4095; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm4[0,1],xmm1[2,3]
4096; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[0,3]
4097; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3,4,5,6,7]
4098; AVX-NEXT:    vmovapd 736(%rdi), %ymm10
4099; AVX-NEXT:    vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4100; AVX-NEXT:    vmovapd 704(%rdi), %ymm0
4101; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4102; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1]
4103; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
4104; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
4105; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
4106; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4107; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
4108; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4]
4109; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7]
4110; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[3,0]
4111; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm12[0,2],xmm7[1,3]
4112; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
4113; AVX-NEXT:    vmovups (%rsp), %ymm12 # 32-byte Reload
4114; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4115; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7]
4116; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
4117; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
4118; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4119; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4120; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4]
4121; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7]
4122; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm13[1,0],xmm5[3,0]
4123; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm7[0,2],xmm5[1,3]
4124; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
4125; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
4126; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4127; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm14[1,3],ymm0[7,5],ymm14[5,7]
4128; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
4129; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
4130; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4131; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4132; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4]
4133; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7]
4134; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm11[1,0],xmm2[3,0]
4135; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm2[1,3]
4136; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
4137; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4138; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
4139; AVX-NEXT:    # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
4140; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
4141; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
4142; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4143; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
4144; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm9[1,0],ymm11[7,4],ymm9[5,4]
4145; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7]
4146; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[1,0],xmm1[3,0]
4147; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3]
4148; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
4149; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
4150; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm13[1,3],ymm10[7,5],ymm13[5,7]
4151; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4152; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4153; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4154; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4155; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
4156; AVX-NEXT:    # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
4157; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
4158; AVX-NEXT:    # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4]
4159; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
4160; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm0
4161; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4162; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3]
4163; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
4164; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
4165; AVX-NEXT:    # ymm3 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
4166; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
4167; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4168; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4]
4169; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
4170; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4171; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4172; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
4173; AVX-NEXT:    # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
4174; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
4175; AVX-NEXT:    # ymm5 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
4176; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
4177; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm0
4178; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4179; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm6[2,0],xmm0[2,3]
4180; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
4181; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
4182; AVX-NEXT:    # ymm2 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7]
4183; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
4184; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4185; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4]
4186; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7]
4187; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4188; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4189; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4190; AVX-NEXT:    # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
4191; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
4192; AVX-NEXT:    # ymm7 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4]
4193; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1]
4194; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm0
4195; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4196; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3]
4197; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7]
4198; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload
4199; AVX-NEXT:    # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7]
4200; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1]
4201; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4]
4202; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7]
4203; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4204; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4205; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
4206; AVX-NEXT:    # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
4207; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4208; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4209; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4]
4210; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1]
4211; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm14
4212; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3]
4213; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
4214; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4215; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
4216; AVX-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
4217; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1]
4218; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4]
4219; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7]
4220; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4221; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4222; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5]
4223; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
4224; AVX-NEXT:    # xmm3 = xmm4[3,1],mem[3,3]
4225; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
4226; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
4227; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm13[2,1],ymm11[7,5],ymm13[6,5]
4228; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
4229; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
4230; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
4231; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4232; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4233; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[3,1],ymm1[4,5],ymm2[7,5]
4234; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
4235; AVX-NEXT:    # xmm2 = xmm6[3,1],mem[3,3]
4236; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
4237; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4238; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5]
4239; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
4240; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
4241; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
4242; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4243; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5]
4244; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3]
4245; AVX-NEXT:    vmovaps %ymm5, %ymm3
4246; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5]
4247; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
4248; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
4249; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
4250; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4251; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5]
4252; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
4253; AVX-NEXT:    # xmm1 = xmm9[3,1],mem[3,3]
4254; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
4255; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4256; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm12[2,1],ymm2[7,5],ymm12[6,5]
4257; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
4258; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
4259; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
4260; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4261; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4262; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4263; AVX-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
4264; AVX-NEXT:    vmovaps 416(%rdi), %xmm0
4265; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4266; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
4267; AVX-NEXT:    vmovaps 400(%rdi), %xmm1
4268; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4269; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
4270; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
4271; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4272; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
4273; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
4274; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
4275; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4276; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
4277; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
4278; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4]
4279; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4280; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4281; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4282; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
4283; AVX-NEXT:    # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
4284; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
4285; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4286; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
4287; AVX-NEXT:    vmovaps 16(%rdi), %xmm15
4288; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3]
4289; AVX-NEXT:    vmovapd 80(%rdi), %xmm10
4290; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[3]
4291; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4]
4292; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
4293; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1]
4294; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4]
4295; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4]
4296; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
4297; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4298; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4299; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
4300; AVX-NEXT:    # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
4301; AVX-NEXT:    vmovaps 224(%rdi), %xmm0
4302; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4303; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
4304; AVX-NEXT:    vmovaps 208(%rdi), %xmm13
4305; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
4306; AVX-NEXT:    vmovapd 272(%rdi), %xmm2
4307; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4308; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[3]
4309; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
4310; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
4311; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1]
4312; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm2[0,0],ymm5[6,4],ymm2[4,4]
4313; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4]
4314; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
4315; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4316; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4317; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
4318; AVX-NEXT:    # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
4319; AVX-NEXT:    vmovaps 608(%rdi), %xmm11
4320; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm11[2,2,3,3]
4321; AVX-NEXT:    vmovaps 592(%rdi), %xmm8
4322; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3]
4323; AVX-NEXT:    vmovapd 656(%rdi), %xmm9
4324; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[3]
4325; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
4326; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4]
4327; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7]
4328; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
4329; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm0[0,0],ymm3[6,4],ymm0[4,4]
4330; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
4331; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5,6,7]
4332; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4]
4333; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm6[2,0],ymm1[4,7],ymm6[6,4]
4334; AVX-NEXT:    vblendps $12, (%rsp), %xmm15, %xmm6 # 16-byte Folded Reload
4335; AVX-NEXT:    # xmm6 = xmm15[0,1],mem[2,3]
4336; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
4337; AVX-NEXT:    # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
4338; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
4339; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
4340; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3]
4341; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7]
4342; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7]
4343; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4344; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm7[1,0],ymm1[7,4],ymm7[5,4]
4345; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,3],ymm1[2,0],ymm7[4,7],ymm1[6,4]
4346; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4347; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
4348; AVX-NEXT:    # xmm7 = xmm7[0,1],mem[2,3]
4349; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4350; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
4351; AVX-NEXT:    # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
4352; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
4353; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
4354; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3]
4355; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7]
4356; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5,6,7]
4357; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm2[1,0],ymm5[7,4],ymm2[5,4]
4358; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4]
4359; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload
4360; AVX-NEXT:    # xmm2 = xmm13[0,1],mem[2,3]
4361; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4362; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
4363; AVX-NEXT:    # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7]
4364; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4365; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4]
4366; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
4367; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7]
4368; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
4369; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4]
4370; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4]
4371; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3]
4372; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
4373; AVX-NEXT:    # ymm3 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7]
4374; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4]
4375; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
4376; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
4377; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
4378; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4379; AVX-NEXT:    vmovaps %ymm2, 96(%rsi)
4380; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4381; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
4382; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4383; AVX-NEXT:    vmovaps %ymm2, 64(%rsi)
4384; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4385; AVX-NEXT:    vmovaps %ymm2, (%rsi)
4386; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4387; AVX-NEXT:    vmovaps %ymm2, 96(%rdx)
4388; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4389; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)
4390; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4391; AVX-NEXT:    vmovaps %ymm2, 64(%rdx)
4392; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4393; AVX-NEXT:    vmovaps %ymm2, (%rdx)
4394; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4395; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
4396; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4397; AVX-NEXT:    vmovaps %ymm2, 96(%rcx)
4398; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4399; AVX-NEXT:    vmovaps %ymm2, 64(%rcx)
4400; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4401; AVX-NEXT:    vmovaps %ymm2, (%rcx)
4402; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4403; AVX-NEXT:    vmovaps %ymm2, 96(%r8)
4404; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4405; AVX-NEXT:    vmovaps %ymm2, 32(%r8)
4406; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4407; AVX-NEXT:    vmovaps %ymm2, 64(%r8)
4408; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4409; AVX-NEXT:    vmovaps %ymm2, (%r8)
4410; AVX-NEXT:    vmovaps %ymm14, 96(%r9)
4411; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4412; AVX-NEXT:    vmovaps %ymm2, 32(%r9)
4413; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4414; AVX-NEXT:    vmovaps %ymm2, (%r9)
4415; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4416; AVX-NEXT:    vmovaps %ymm2, 64(%r9)
4417; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4418; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
4419; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
4420; AVX-NEXT:    vmovaps %ymm7, 64(%rax)
4421; AVX-NEXT:    vmovaps %ymm6, (%rax)
4422; AVX-NEXT:    addq $1032, %rsp # imm = 0x408
4423; AVX-NEXT:    vzeroupper
4424; AVX-NEXT:    retq
4425;
4426; AVX2-LABEL: load_i32_stride6_vf32:
4427; AVX2:       # %bb.0:
4428; AVX2-NEXT:    subq $1224, %rsp # imm = 0x4C8
4429; AVX2-NEXT:    vmovaps 480(%rdi), %ymm9
4430; AVX2-NEXT:    vmovaps 448(%rdi), %ymm11
4431; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4432; AVX2-NEXT:    vmovaps 416(%rdi), %ymm8
4433; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4434; AVX2-NEXT:    vmovaps 128(%rdi), %ymm2
4435; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
4436; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
4437; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4438; AVX2-NEXT:    vmovaps 96(%rdi), %ymm4
4439; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4440; AVX2-NEXT:    vmovaps (%rdi), %ymm0
4441; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4442; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
4443; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4444; AVX2-NEXT:    vmovaps 64(%rdi), %ymm5
4445; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4446; AVX2-NEXT:    vmovaps {{.*#+}} xmm10 = [0,6,4,u]
4447; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
4448; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4449; AVX2-NEXT:    vpermps %ymm0, %ymm10, %ymm0
4450; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
4451; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7]
4452; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6]
4453; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4454; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4455; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
4456; AVX2-NEXT:    vpermps %ymm7, %ymm6, %ymm1
4457; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4458; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4459; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4460; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1]
4461; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
4462; AVX2-NEXT:    vmovaps 384(%rdi), %ymm0
4463; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4464; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
4465; AVX2-NEXT:    vpermps %ymm3, %ymm10, %ymm0
4466; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
4467; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4468; AVX2-NEXT:    vmovaps 512(%rdi), %ymm1
4469; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4470; AVX2-NEXT:    vmovaps 544(%rdi), %ymm2
4471; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4472; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4473; AVX2-NEXT:    vpermps %ymm2, %ymm6, %ymm1
4474; AVX2-NEXT:    vmovaps %ymm6, %ymm9
4475; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4476; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4477; AVX2-NEXT:    vmovaps 288(%rdi), %ymm1
4478; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4479; AVX2-NEXT:    vmovaps 256(%rdi), %ymm0
4480; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4481; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4482; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4483; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
4484; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4485; AVX2-NEXT:    vmovaps 192(%rdi), %ymm6
4486; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4487; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
4488; AVX2-NEXT:    vpermps %ymm1, %ymm10, %ymm8
4489; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6]
4490; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7]
4491; AVX2-NEXT:    vmovaps 320(%rdi), %ymm6
4492; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4493; AVX2-NEXT:    vmovaps 352(%rdi), %ymm8
4494; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4495; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
4496; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm14
4497; AVX2-NEXT:    vmovaps %ymm9, %ymm0
4498; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4499; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7]
4500; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4501; AVX2-NEXT:    vmovaps 608(%rdi), %ymm6
4502; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4503; AVX2-NEXT:    vmovaps 576(%rdi), %ymm9
4504; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4505; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
4506; AVX2-NEXT:    vpermps %ymm14, %ymm10, %ymm10
4507; AVX2-NEXT:    vmovaps 672(%rdi), %ymm6
4508; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4509; AVX2-NEXT:    vmovaps 640(%rdi), %ymm9
4510; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4511; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1]
4512; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
4513; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6]
4514; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
4515; AVX2-NEXT:    vmovaps 704(%rdi), %ymm6
4516; AVX2-NEXT:    vmovaps 736(%rdi), %ymm11
4517; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7]
4518; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4519; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4520; AVX2-NEXT:    vpermps %ymm15, %ymm0, %ymm9
4521; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
4522; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4523; AVX2-NEXT:    vmovaps {{.*#+}} xmm9 = [1,7,5,u]
4524; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
4525; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7]
4526; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
4527; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3]
4528; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4529; AVX2-NEXT:    vpermps %ymm7, %ymm10, %ymm7
4530; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
4531; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4532; AVX2-NEXT:    vpermps %ymm3, %ymm9, %ymm3
4533; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
4534; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
4535; AVX2-NEXT:    vpermps %ymm2, %ymm10, %ymm2
4536; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4537; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4538; AVX2-NEXT:    vpermps %ymm1, %ymm9, %ymm0
4539; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7]
4540; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4541; AVX2-NEXT:    vpermps %ymm8, %ymm10, %ymm1
4542; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4543; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4544; AVX2-NEXT:    vpermps %ymm14, %ymm9, %ymm0
4545; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
4546; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4547; AVX2-NEXT:    vpermps %ymm15, %ymm10, %ymm1
4548; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4549; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4550; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
4551; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
4552; AVX2-NEXT:    # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
4553; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
4554; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
4555; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4556; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
4557; AVX2-NEXT:    # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
4558; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7]
4559; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
4560; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4561; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
4562; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
4563; AVX2-NEXT:    # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
4564; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4]
4565; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
4566; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4567; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4568; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4569; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4570; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
4571; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
4572; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
4573; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4574; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
4575; AVX2-NEXT:    # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
4576; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7]
4577; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
4578; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4579; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4580; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
4581; AVX2-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
4582; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4]
4583; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
4584; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4585; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4586; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
4587; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
4588; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
4589; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
4590; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3]
4591; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4592; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4593; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
4594; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7]
4595; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3]
4596; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7]
4597; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
4598; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4]
4599; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
4600; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7]
4601; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4602; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4603; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
4604; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7]
4605; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
4606; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT:    vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,7,5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
; AVX2-NEXT:    vmovaps %ymm6, %ymm4
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 464(%rdi), %xmm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT:    vpermps %ymm3, %ymm8, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm7, %ymm3, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovaps 80(%rdi), %xmm5
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-NEXT:    vpermps %ymm14, %ymm8, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovaps 272(%rdi), %xmm4
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-NEXT:    vpermps %ymm12, %ymm8, %ymm11
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm3, %ymm15
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7]
; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovaps 656(%rdi), %xmm0
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
; AVX2-NEXT:    vpermps %ymm15, %ymm8, %ymm8
; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm10, %ymm3, %ymm3
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT:    vpermps %ymm14, %ymm13, %ymm6
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7]
; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm2, %ymm6, %ymm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm5 = mem[1,1,1,1,5,5,5,5]
; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm7, %ymm6, %ymm7
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm12, %ymm13, %ymm7
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm15, %ymm13, %ymm4
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm10, %ymm6, %ymm4
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 96(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 64(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 96(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 64(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 96(%rcx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 64(%rcx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 96(%r8)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 32(%r8)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, 64(%r8)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm4, (%r8)
; AVX2-NEXT:    vmovaps %ymm3, 96(%r9)
; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, 32(%r9)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, (%r9)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, 64(%r9)
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX2-NEXT:    vmovaps %ymm1, 32(%rax)
; AVX2-NEXT:    vmovaps %ymm5, 64(%rax)
; AVX2-NEXT:    vmovaps %ymm2, (%rax)
; AVX2-NEXT:    addq $1224, %rsp # imm = 0x4C8
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf32:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    subq $1224, %rsp # imm = 0x4C8
; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm9
; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm11
; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm8
; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm4
; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm10 = [0,6,4,u]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm10, %ymm0
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm6, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm10, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm6, %ymm1
; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm10, %ymm8
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm8
; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm14
; AVX2-FP-NEXT:    vmovaps %ymm9, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm9
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm10, %ymm10
; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm9
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm11
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm15, %ymm0, %ymm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm9 = [1,7,5,u]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3]
; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm9, %ymm3
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm10, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm15, %ymm10, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm4
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 464(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm8, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm3, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm5
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm8, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm4
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm8, %ymm11
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm3, %ymm15
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 656(%rdi), %xmm0
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm15, %ymm8, %ymm8
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm3, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm13, %ymm6
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7]
; AVX2-FP-NEXT:    # ymm6 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm5 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm6, %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm13, %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm15, %ymm13, %ymm4
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm6, %ymm4
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm4, (%r8)
; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%r9)
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, (%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%r9)
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
; AVX2-FP-NEXT:    addq $1224, %rsp # imm = 0x4C8
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf32:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    subq $1192, %rsp # imm = 0x4A8
; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm6
; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm11
; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm10
; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm5
; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm9 = [0,6,4,u]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm6[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm9, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm10
; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm15
; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm10
; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm11
; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm10[4,5],ymm11[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm9, %ymm15
; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm5
; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm9
; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm5[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm9
; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm10
; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm10
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm10 = [1,7,5,u]
; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm10, %ymm1
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm0
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm10, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm1
; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm13
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm7, %ymm0
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm8 = [2,0,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $12, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm8, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,0,6,4,0,0,6,4]
; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm7, %ymm0
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm8, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm6 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm10, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm7, %ymm1
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm8, %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm10, %ymm12
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm12[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm11 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm7, %ymm7
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm8, %ymm8
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm10, %ymm10
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [0,1,7,5,0,1,7,5]
; AVX2-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm7, %ymm2
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm10 = [3,1,7,5,0,u,u,u]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm10, %ymm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1,2,3,4],ymm2[5],ymm14[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm7, %ymm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1,2,3,4],ymm2[5],ymm11[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm10, %ymm0
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm7, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm1 = ymm4[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 464(%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6]
; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = ymm15[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm14
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm5, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm4, %ymm8
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm10 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm5, %ymm9
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm8[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm15
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm15[5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 656(%rdi), %xmm0
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm5
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm13, %ymm5
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,3,1,7,0,3,1,7]
; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm6
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm10[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm6
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm6
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm13, %ymm6
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm5
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rcx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rcx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rcx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rcx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%r8)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%r8)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%r8)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, (%r8)
5536; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%r9)
5537; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5538; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r9)
5539; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
5540; AVX2-FCP-NEXT:    vmovaps %ymm4, (%r9)
5541; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5542; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%r9)
5543; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5544; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
5545; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rax)
5546; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rax)
5547; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
5548; AVX2-FCP-NEXT:    addq $1192, %rsp # imm = 0x4A8
5549; AVX2-FCP-NEXT:    vzeroupper
5550; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride6_vf32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512-NEXT:    movb $56, %dil
; AVX512-NEXT:    kmovw %edi, %k2
; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512-NEXT:    movw $31, %di
; AVX512-NEXT:    kmovw %edi, %k2
; AVX512-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512-NEXT:    movb $-32, %dil
; AVX512-NEXT:    kmovw %edi, %k2
; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf32:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512-FCP-NEXT:    movb $56, %dil
; AVX512-FCP-NEXT:    kmovw %edi, %k2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512-FCP-NEXT:    kmovw %edi, %k1
; AVX512-FCP-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512-FCP-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512-FCP-NEXT:    movw $31, %di
; AVX512-FCP-NEXT:    kmovw %edi, %k2
; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512-FCP-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512-FCP-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512-FCP-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512-FCP-NEXT:    kmovw %edi, %k1
; AVX512-FCP-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512-FCP-NEXT:    movb $-32, %dil
; AVX512-FCP-NEXT:    kmovw %edi, %k2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512DQ-NEXT:    movb $56, %dil
; AVX512DQ-NEXT:    kmovw %edi, %k2
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512DQ-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-NEXT:    kmovw %edi, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512DQ-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512DQ-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512DQ-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512DQ-NEXT:    movw $31, %di
; AVX512DQ-NEXT:    kmovw %edi, %k2
; AVX512DQ-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512DQ-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-NEXT:    kmovw %edi, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-NEXT:    movb $-32, %dil
; AVX512DQ-NEXT:    kmovw %edi, %k2
; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf32:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT:    movb $56, %dil
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512DQ-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512DQ-FCP-NEXT:    movw $31, %di
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-FCP-NEXT:    movb $-32, %dil
; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512BW-NEXT:    movb $56, %dil
; AVX512BW-NEXT:    kmovd %edi, %k2
; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512BW-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512BW-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512BW-NEXT:    movw $31, %di
; AVX512BW-NEXT:    kmovd %edi, %k2
; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512BW-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512BW-NEXT:    kmovd %edi, %k1
; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512BW-NEXT:    movb $-32, %dil
; AVX512BW-NEXT:    kmovd %edi, %k2
; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512BW-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf32:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512BW-FCP-NEXT:    movb $56, %dil
; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512BW-FCP-NEXT:    movw $31, %di
; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512BW-FCP-NEXT:    movb $-32, %dil
; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf32:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512DQ-BW-NEXT:    movb $56, %dil
; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512DQ-BW-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512DQ-BW-NEXT:    movw $31, %di
; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-BW-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-BW-NEXT:    movb $-32, %dil
; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf32:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm13
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT:    movb $56, %dil
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm15, %zmm16
; AVX512DQ-BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm5, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm16
; AVX512DQ-BW-FCP-NEXT:    movw $31, %di
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-BW-FCP-NEXT:    movb $-32, %dil
; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <192 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
  %strided.vec1 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
  %strided.vec2 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
  %strided.vec3 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
  %strided.vec4 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
  %strided.vec5 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
  store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <32 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <32 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <32 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf64:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $2184, %rsp # imm = 0x888
; SSE-NEXT:    movdqa 912(%rdi), %xmm7
; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 928(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 864(%rdi), %xmm8
; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 880(%rdi), %xmm4
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 528(%rdi), %xmm9
; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 544(%rdi), %xmm5
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 480(%rdi), %xmm10
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 496(%rdi), %xmm6
; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 144(%rdi), %xmm11
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 160(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 96(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT:    movdqa 112(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa %xmm10, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE-NEXT:    movdqa %xmm8, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1248(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1264(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 1296(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1312(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 16(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 64(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 48(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 384(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 400(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 432(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 448(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 768(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 784(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 816(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 832(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1152(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1168(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 1200(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1216(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 288(%rdi), %xmm2
; SSE-NEXT:    movdqa 304(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm15
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 336(%rdi), %xmm7
; SSE-NEXT:    movdqa 352(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1]
; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 672(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 688(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 720(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 736(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1056(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1072(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 1104(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1120(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1440(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1456(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa 1488(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1504(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 192(%rdi), %xmm5
; SSE-NEXT:    movdqa 208(%rdi), %xmm6
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movdqa 240(%rdi), %xmm2
; SSE-NEXT:    movdqa 256(%rdi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1]
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 576(%rdi), %xmm10
; SSE-NEXT:    movdqa 592(%rdi), %xmm14
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3]
; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm10, %xmm4
; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    movdqa 624(%rdi), %xmm11
; SSE-NEXT:    movdqa 640(%rdi), %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3]
; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 960(%rdi), %xmm4
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 976(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    movdqa 1008(%rdi), %xmm8
; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1024(%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1344(%rdi), %xmm4
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1360(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    movdqa 1392(%rdi), %xmm9
; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 1408(%rdi), %xmm3
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm9, %xmm12
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm8, %xmm12
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1]
; SSE-NEXT:    movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm12, %xmm7
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1]
; SSE-NEXT:    movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1]
; SSE-NEXT:    movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm11, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm13, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm14, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm5, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
; SSE-NEXT:    movdqa 80(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 32(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
; SSE-NEXT:    movdqa 176(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, (%rsp), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 128(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    movdqa 272(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 224(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    movdqa 368(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 320(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7200; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
7201; SSE-NEXT:    movdqa 464(%rdi), %xmm1
7202; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7203; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7204; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7205; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
7206; SSE-NEXT:    movdqa 416(%rdi), %xmm1
7207; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7208; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
7209; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7210; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7211; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
7212; SSE-NEXT:    movdqa 560(%rdi), %xmm1
7213; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7214; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7215; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7216; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3]
7217; SSE-NEXT:    movdqa 512(%rdi), %xmm2
7218; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7219; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7220; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7221; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7222; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7223; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
7224; SSE-NEXT:    movdqa 656(%rdi), %xmm1
7225; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7226; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7227; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7228; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7229; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
7230; SSE-NEXT:    movdqa 608(%rdi), %xmm9
7231; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
7232; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7233; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7234; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7235; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7236; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
7237; SSE-NEXT:    movdqa 752(%rdi), %xmm1
7238; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7239; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7240; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7241; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7242; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3]
7243; SSE-NEXT:    movdqa 704(%rdi), %xmm2
7244; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7245; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7246; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7247; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7248; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7249; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
7250; SSE-NEXT:    movdqa 848(%rdi), %xmm1
7251; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7252; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7253; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7254; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
7255; SSE-NEXT:    movdqa 800(%rdi), %xmm2
7256; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7257; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7258; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7259; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7260; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
7261; SSE-NEXT:    movdqa 944(%rdi), %xmm1
7262; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7263; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7264; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7265; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7266; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
7267; SSE-NEXT:    movdqa 896(%rdi), %xmm8
7268; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
7269; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7270; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7271; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7272; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
7273; SSE-NEXT:    movdqa 1040(%rdi), %xmm1
7274; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7275; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7276; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7277; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7278; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
7279; SSE-NEXT:    movdqa 992(%rdi), %xmm1
7280; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7281; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
7282; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
7283; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7284; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
7285; SSE-NEXT:    movdqa 1136(%rdi), %xmm1
7286; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7287; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
7288; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7289; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7290; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
7291; SSE-NEXT:    movdqa 1088(%rdi), %xmm6
7292; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
7293; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT:    movdqa 1232(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa 1184(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    movdqa 1328(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 1280(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    movdqa 1424(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 1376(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    movdqa 1520(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
; SSE-NEXT:    movdqa 1472(%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, (%rsp), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE-NEXT:    movapd %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT:    # xmm15 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
; SSE-NEXT:    # xmm11 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT:    # xmm9 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT:    # xmm8 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT:    # xmm13 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT:    # xmm7 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; SSE-NEXT:    # xmm5 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT:    # xmm3 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, %xmm4
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
; SSE-NEXT:    movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
; SSE-NEXT:    movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
; SSE-NEXT:    movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
; SSE-NEXT:    movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
; SSE-NEXT:    movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 224(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 240(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 192(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 208(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 224(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 240(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 192(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 208(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 240(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 224(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 208(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 192(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 240(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 224(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 208(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 192(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%r8)
; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%r8)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%r8)
; SSE-NEXT:    movapd %xmm2, 240(%r9)
; SSE-NEXT:    movapd %xmm3, 224(%r9)
; SSE-NEXT:    movapd %xmm5, 208(%r9)
; SSE-NEXT:    movapd %xmm7, 192(%r9)
; SSE-NEXT:    movapd %xmm13, 176(%r9)
; SSE-NEXT:    movapd %xmm8, 160(%r9)
; SSE-NEXT:    movapd %xmm9, 144(%r9)
; SSE-NEXT:    movapd %xmm11, 128(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%r9)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%r9)
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movapd %xmm14, 240(%rax)
; SSE-NEXT:    movapd %xmm12, 224(%rax)
; SSE-NEXT:    movapd %xmm15, 208(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 192(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rax)
; SSE-NEXT:    movapd %xmm4, (%rax)
; SSE-NEXT:    addq $2184, %rsp # imm = 0x888
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf64:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $2584, %rsp # imm = 0xA18
; AVX-NEXT:    vmovaps 608(%rdi), %ymm6
; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 672(%rdi), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 640(%rdi), %ymm3
; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 352(%rdi), %ymm4
; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 320(%rdi), %ymm5
; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 224(%rdi), %ymm7
; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 192(%rdi), %ymm8
; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 288(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 256(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm0, %ymm9
; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm9
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 672(%rdi), %ymm3, %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6]
; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vmovapd 736(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 704(%rdi), %ymm2
; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1056(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1024(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 1056(%rdi), %ymm0, %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6]
; AVX-NEXT:    vmovaps 992(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 960(%rdi), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm13
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm13[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vmovapd 1120(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 1088(%rdi), %ymm2
; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1440(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1408(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 1440(%rdi), %ymm0, %ymm11
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm11[0,0],ymm1[6,4],ymm11[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,2],ymm0[6,4],ymm11[6,6]
; AVX-NEXT:    vmovaps 1376(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1344(%rdi), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm10
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vmovapd 1504(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 1472(%rdi), %ymm2
; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 96(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm0, %ymm8
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6]
; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rdi), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vmovapd 160(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 128(%rdi), %ymm2
; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 480(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 448(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 480(%rdi), %ymm0, %ymm6
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6]
; AVX-NEXT:    vmovaps 416(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 384(%rdi), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vmovapd 544(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 512(%rdi), %ymm2
; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 864(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 832(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 864(%rdi), %ymm0, %ymm3
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6]
; AVX-NEXT:    vmovaps 800(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 768(%rdi), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    vmovapd 928(%rdi), %ymm1
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 896(%rdi), %ymm5
; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1248(%rdi), %ymm5
; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1216(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, 1248(%rdi), %ymm0, %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm1[0,0],ymm5[6,4],ymm1[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6]
; AVX-NEXT:    vmovaps 1184(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 1152(%rdi), %ymm5
; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm5[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm5[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX-NEXT:    vmovapd 1312(%rdi), %ymm12
; AVX-NEXT:    vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovapd 1280(%rdi), %ymm0
; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1]
; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm9[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm9[1,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[1,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX-NEXT:    # ymm15 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
8278; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8279; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8280; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8281; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4]
8282; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,3],ymm14[6,4],ymm1[6,7]
8283; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8284; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm13[3,0]
8285; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm15[0,2],xmm13[1,3]
8286; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
8287; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
8288; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8289; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7]
8290; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
8291; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
8292; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8293; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
8294; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[3,0],ymm11[1,0],ymm14[7,4],ymm11[5,4]
8295; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[2,0],ymm11[2,3],ymm13[6,4],ymm11[6,7]
8296; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8297; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm0[1,0],xmm10[3,0]
8298; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm13[0,2],xmm10[1,3]
8299; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
8300; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8301; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8302; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm13[1,3],ymm0[7,5],ymm13[5,7]
8303; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
8304; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
8305; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8306; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8307; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[3,0],ymm8[1,0],ymm11[7,4],ymm8[5,4]
8308; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7]
8309; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8310; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm0[1,0],xmm7[3,0]
8311; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm10[0,2],xmm7[1,3]
8312; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
8313; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8314; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
8315; AVX-NEXT:    # ymm8 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
8316; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
8317; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
8318; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8319; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8320; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4]
8321; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7]
8322; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8323; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm4[3,0]
8324; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm4[1,3]
8325; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
8326; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8327; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8328; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm7[1,3],ymm0[7,5],ymm7[5,7]
8329; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
8330; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
8331; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8332; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8333; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4]
8334; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7]
8335; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8336; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm0[1,0],xmm2[3,0]
8337; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3]
8338; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
8339; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8340; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8341; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm4[1,3],ymm0[7,5],ymm4[5,7]
8342; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
8343; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
8344; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8345; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8346; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8347; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4]
8348; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
8349; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8350; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm5[3,0]
8351; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm5[1,3]
8352; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
8353; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8354; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
8355; AVX-NEXT:    # ymm1 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7]
8356; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
8357; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
8358; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8359; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8360; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8361; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8362; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8363; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8364; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8365; AVX-NEXT:    # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4]
8366; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8367; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
8368; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8369; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
8370; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8371; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
8372; AVX-NEXT:    # ymm12 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
8373; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1]
8374; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8375; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm12[2,0],ymm1[4,4],ymm12[6,4]
8376; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8377; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8378; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8379; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8380; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8381; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8382; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
8383; AVX-NEXT:    # ymm0 = ymm9[2,1],mem[2,0],ymm9[6,5],mem[6,4]
8384; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8385; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
8386; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8387; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
8388; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8389; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8390; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
8391; AVX-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
8392; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8393; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
8394; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8395; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
8396; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8397; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8398; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8399; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8400; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8401; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8402; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8403; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8404; AVX-NEXT:    # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4]
8405; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8406; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
8407; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8408; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
8409; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8410; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
8411; AVX-NEXT:    # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7]
8412; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8413; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
8414; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8415; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
8416; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8417; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8418; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8419; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8420; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8421; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8422; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
8423; AVX-NEXT:    # ymm0 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4]
8424; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8425; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
8426; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8427; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
8428; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8429; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload
8430; AVX-NEXT:    # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7]
8431; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1]
8432; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8433; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4]
8434; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8435; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8436; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8437; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8438; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8439; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8440; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
8441; AVX-NEXT:    # ymm0 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4]
8442; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8443; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
8444; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8445; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
8446; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8447; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8448; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
8449; AVX-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
8450; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8451; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
8452; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8453; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
8454; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8455; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8456; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8457; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8458; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8459; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8460; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
8461; AVX-NEXT:    # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
8462; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8463; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
8464; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8465; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
8466; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8467; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
8468; AVX-NEXT:    # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
8469; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1]
8470; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8471; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4]
8472; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8473; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8474; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8475; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
8476; AVX-NEXT:    # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8477; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
8478; AVX-NEXT:    # ymm0 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4]
8479; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
8480; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm1
8481; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8482; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
8483; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
8484; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
8485; AVX-NEXT:    # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7]
8486; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1]
8487; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4]
8488; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
8489; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8490; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8491; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
8492; AVX-NEXT:    # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
8493; AVX-NEXT:    vmovaps %ymm3, %ymm4
8494; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8495; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4]
8496; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
8497; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm7
8498; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3]
8499; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
8500; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8501; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
8502; AVX-NEXT:    # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
8503; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1]
8504; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4]
8505; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
8506; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8507; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8508; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm12[3,1],ymm0[4,5],ymm12[7,5]
8509; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8510; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
8511; AVX-NEXT:    # xmm11 = xmm1[3,1],mem[3,3]
8512; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
8513; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8514; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5]
8515; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
8516; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
8517; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
8518; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8519; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8520; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8521; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
8522; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8523; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
8524; AVX-NEXT:    # xmm11 = xmm1[3,1],mem[3,3]
8525; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
8526; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
8527; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5]
8528; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
8529; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
8530; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
8531; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8532; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8533; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
8534; AVX-NEXT:    # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
8535; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8536; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
8537; AVX-NEXT:    # xmm11 = xmm0[3,1],mem[3,3]
8538; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8539; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
8540; AVX-NEXT:    # ymm12 = ymm0[3,1],mem[2,1],ymm0[7,5],mem[6,5]
8541; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
8542; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
8543; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7]
8544; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8545; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8546; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5]
8547; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8548; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload
8549; AVX-NEXT:    # xmm10 = xmm1[3,1],mem[3,3]
8550; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8551; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
8552; AVX-NEXT:    # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5]
8553; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1]
8554; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
8555; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
8556; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8557; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5]
8558; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3]
8559; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
8560; AVX-NEXT:    vmovaps %ymm4, %ymm15
8561; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
8562; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
8563; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
8564; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8565; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5]
8566; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
8567; AVX-NEXT:    # xmm1 = xmm8[3,1],mem[3,3]
8568; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8569; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8570; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5]
8571; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
8572; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
8573; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
8574; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8575; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8576; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5]
8577; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8578; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8579; AVX-NEXT:    # xmm1 = xmm1[3,1],mem[3,3]
8580; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8581; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8582; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5]
8583; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
8584; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
8585; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
8586; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8587; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8588; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8589; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
8590; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8591; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8592; AVX-NEXT:    # xmm1 = xmm1[3,1],mem[3,3]
8593; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
8594; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8595; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5]
8596; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
8597; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
8598; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
8599; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8600; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8601; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8602; AVX-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8603; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
8604; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8605; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8606; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
8607; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8608; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8609; AVX-NEXT:    vmovapd 80(%rdi), %xmm1
8610; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8611; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3]
8612; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
8613; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8614; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8615; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1]
8616; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm10[0,0],ymm2[6,4],ymm10[4,4]
8617; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4]
8618; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8619; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8620; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8621; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8622; AVX-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8623; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8624; AVX-NEXT:    vmovaps 224(%rdi), %xmm0
8625; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8626; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8627; AVX-NEXT:    vmovaps 208(%rdi), %xmm1
8628; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8629; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8630; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
8631; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8632; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[3]
8633; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
8634; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
8635; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8636; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
8637; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
8638; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4]
8639; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8640; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8641; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8642; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8643; AVX-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8644; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8645; AVX-NEXT:    vmovaps 416(%rdi), %xmm0
8646; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8647; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8648; AVX-NEXT:    vmovaps 400(%rdi), %xmm1
8649; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8650; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8651; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
8652; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8653; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[3]
8654; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
8655; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8656; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
8657; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4]
8658; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4]
8659; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8660; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8661; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8662; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
8663; AVX-NEXT:    # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8664; AVX-NEXT:    vmovaps 608(%rdi), %xmm0
8665; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8666; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8667; AVX-NEXT:    vmovaps 592(%rdi), %xmm1
8668; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8669; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8670; AVX-NEXT:    vmovapd 656(%rdi), %xmm1
8671; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8672; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8673; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
8674; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8675; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
8676; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8677; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm12[2,3,0,1]
8678; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm4[0,0],ymm12[6,4],ymm4[4,4]
8679; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm1[2,0],ymm4[4,6],ymm1[6,4]
8680; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8681; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8682; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8683; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
8684; AVX-NEXT:    # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8685; AVX-NEXT:    vmovaps 800(%rdi), %xmm0
8686; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8687; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8688; AVX-NEXT:    vmovaps 784(%rdi), %xmm1
8689; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8690; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8691; AVX-NEXT:    vmovapd 848(%rdi), %xmm1
8692; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8693; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
8694; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4]
8695; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8696; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1]
8697; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4]
8698; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
8699; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8700; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8701; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8702; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
8703; AVX-NEXT:    # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8704; AVX-NEXT:    vmovaps 992(%rdi), %xmm0
8705; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8706; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8707; AVX-NEXT:    vmovaps 976(%rdi), %xmm1
8708; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8709; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8710; AVX-NEXT:    vmovapd 1040(%rdi), %xmm1
8711; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8712; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8713; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
8714; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8715; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
8716; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8717; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1]
8718; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4]
8719; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
8720; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8721; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8722; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8723; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
8724; AVX-NEXT:    # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8725; AVX-NEXT:    vmovaps 1184(%rdi), %xmm0
8726; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8727; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8728; AVX-NEXT:    vmovaps 1168(%rdi), %xmm1
8729; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8730; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
8731; AVX-NEXT:    vmovapd 1232(%rdi), %xmm1
8732; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8733; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8734; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
8735; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
8736; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
8737; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
8738; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4]
8739; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
8740; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7]
8741; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8742; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8743; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
8744; AVX-NEXT:    # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
8745; AVX-NEXT:    vmovaps 1376(%rdi), %xmm0
8746; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8747; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
8748; AVX-NEXT:    vmovaps 1360(%rdi), %xmm9
8749; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8750; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3]
8751; AVX-NEXT:    vmovapd 1424(%rdi), %xmm9
8752; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8753; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
8754; AVX-NEXT:    # ymm15 = ymm9[1],mem[0],ymm9[2],mem[3]
8755; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
8756; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4]
8757; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
8758; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
8759; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4]
8760; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4]
8761; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm13[5,6,7]
8762; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8763; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
8764; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4]
8765; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm13[2,0],ymm10[4,7],ymm13[6,4]
8766; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8767; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
8768; AVX-NEXT:    # xmm13 = xmm10[0,1],mem[2,3]
8769; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8770; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
8771; AVX-NEXT:    # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
8772; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8773; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
8774; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
8775; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
8776; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7]
8777; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8778; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4]
8779; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4]
8780; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8781; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
8782; AVX-NEXT:    # xmm13 = xmm10[0,1],mem[2,3]
8783; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8784; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
8785; AVX-NEXT:    # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
8786; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8787; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
8788; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
8789; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
8790; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7]
8791; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8792; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4]
8793; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4]
8794; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8795; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
8796; AVX-NEXT:    # xmm13 = xmm10[0,1],mem[2,3]
8797; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8798; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
8799; AVX-NEXT:    # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
8800; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8801; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
8802; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
8803; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
8804; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7]
8805; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4]
8806; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4]
8807; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8808; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
8809; AVX-NEXT:    # xmm12 = xmm10[0,1],mem[2,3]
8810; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8811; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload
8812; AVX-NEXT:    # ymm13 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
8813; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8814; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[1,1],ymm13[2,0],ymm10[5,5],ymm13[6,4]
8815; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3]
8816; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
8817; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
8818; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4]
8819; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4]
8820; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8821; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload
8822; AVX-NEXT:    # xmm11 = xmm10[0,1],mem[2,3]
8823; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8824; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload
8825; AVX-NEXT:    # ymm12 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
8826; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8827; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4]
8828; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
8829; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
8830; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
8831; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4]
8832; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4]
8833; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8834; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
8835; AVX-NEXT:    # xmm10 = xmm10[0,1],mem[2,3]
8836; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8837; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
8838; AVX-NEXT:    # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7]
8839; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
8840; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4]
8841; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3]
8842; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
8843; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7]
8844; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4]
8845; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4]
8846; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8847; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
8848; AVX-NEXT:    # xmm8 = xmm8[0,1],mem[2,3]
8849; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8850; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
8851; AVX-NEXT:    # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
8852; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8853; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4]
8854; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3]
8855; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
8856; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7]
8857; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4]
8858; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4]
8859; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8860; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
8861; AVX-NEXT:    # xmm6 = xmm6[0,1],mem[2,3]
8862; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8863; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
8864; AVX-NEXT:    # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
8865; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
8866; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4]
8867; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3]
8868; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7]
8869; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
8870; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8871; AVX-NEXT:    vmovaps %ymm6, 192(%rsi)
8872; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8873; AVX-NEXT:    vmovaps %ymm6, 128(%rsi)
8874; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8875; AVX-NEXT:    vmovaps %ymm6, 64(%rsi)
8876; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8877; AVX-NEXT:    vmovaps %ymm6, (%rsi)
8878; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8879; AVX-NEXT:    vmovaps %ymm6, 224(%rsi)
8880; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8881; AVX-NEXT:    vmovaps %ymm6, 160(%rsi)
8882; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8883; AVX-NEXT:    vmovaps %ymm6, 96(%rsi)
8884; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8885; AVX-NEXT:    vmovaps %ymm6, 32(%rsi)
8886; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8887; AVX-NEXT:    vmovaps %ymm6, 192(%rdx)
8888; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8889; AVX-NEXT:    vmovaps %ymm6, 128(%rdx)
8890; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8891; AVX-NEXT:    vmovaps %ymm6, 64(%rdx)
8892; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8893; AVX-NEXT:    vmovaps %ymm6, (%rdx)
8894; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8895; AVX-NEXT:    vmovaps %ymm6, 224(%rdx)
8896; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8897; AVX-NEXT:    vmovaps %ymm6, 160(%rdx)
8898; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8899; AVX-NEXT:    vmovaps %ymm6, 96(%rdx)
8900; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8901; AVX-NEXT:    vmovaps %ymm6, 32(%rdx)
8902; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8903; AVX-NEXT:    vmovaps %ymm6, 192(%rcx)
8904; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8905; AVX-NEXT:    vmovaps %ymm6, 128(%rcx)
8906; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8907; AVX-NEXT:    vmovaps %ymm6, 64(%rcx)
8908; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8909; AVX-NEXT:    vmovaps %ymm6, (%rcx)
8910; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8911; AVX-NEXT:    vmovaps %ymm6, 224(%rcx)
8912; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8913; AVX-NEXT:    vmovaps %ymm6, 160(%rcx)
8914; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8915; AVX-NEXT:    vmovaps %ymm6, 96(%rcx)
8916; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8917; AVX-NEXT:    vmovaps %ymm6, 32(%rcx)
8918; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8919; AVX-NEXT:    vmovaps %ymm6, (%r8)
8920; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8921; AVX-NEXT:    vmovaps %ymm6, 64(%r8)
8922; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8923; AVX-NEXT:    vmovaps %ymm6, 128(%r8)
8924; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8925; AVX-NEXT:    vmovaps %ymm6, 192(%r8)
8926; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8927; AVX-NEXT:    vmovaps %ymm6, 224(%r8)
8928; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8929; AVX-NEXT:    vmovaps %ymm6, 160(%r8)
8930; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8931; AVX-NEXT:    vmovaps %ymm6, 96(%r8)
8932; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8933; AVX-NEXT:    vmovaps %ymm6, 32(%r8)
8934; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8935; AVX-NEXT:    vmovaps %ymm6, 224(%r9)
8936; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8937; AVX-NEXT:    vmovaps %ymm6, 192(%r9)
8938; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8939; AVX-NEXT:    vmovaps %ymm6, 160(%r9)
8940; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8941; AVX-NEXT:    vmovaps %ymm6, 128(%r9)
8942; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8943; AVX-NEXT:    vmovaps %ymm6, 96(%r9)
8944; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8945; AVX-NEXT:    vmovaps %ymm6, 64(%r9)
8946; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8947; AVX-NEXT:    vmovaps %ymm6, 32(%r9)
8948; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8949; AVX-NEXT:    vmovaps %ymm6, (%r9)
8950; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8951; AVX-NEXT:    vmovaps %ymm0, 224(%rax)
8952; AVX-NEXT:    vmovaps %ymm1, 192(%rax)
8953; AVX-NEXT:    vmovaps %ymm2, 160(%rax)
8954; AVX-NEXT:    vmovaps %ymm3, 128(%rax)
8955; AVX-NEXT:    vmovaps %ymm4, 96(%rax)
8956; AVX-NEXT:    vmovaps %ymm5, 64(%rax)
8957; AVX-NEXT:    vmovaps %ymm7, 32(%rax)
8958; AVX-NEXT:    vmovaps %ymm9, (%rax)
8959; AVX-NEXT:    addq $2584, %rsp # imm = 0xA18
8960; AVX-NEXT:    vzeroupper
8961; AVX-NEXT:    retq
8962;
8963; AVX2-LABEL: load_i32_stride6_vf64:
8964; AVX2:       # %bb.0:
8965; AVX2-NEXT:    subq $2568, %rsp # imm = 0xA08
8966; AVX2-NEXT:    vmovaps 672(%rdi), %ymm4
8967; AVX2-NEXT:    vmovaps 640(%rdi), %ymm5
8968; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8969; AVX2-NEXT:    vmovaps 608(%rdi), %ymm3
8970; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8971; AVX2-NEXT:    vmovaps 320(%rdi), %ymm6
8972; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8973; AVX2-NEXT:    vmovaps 352(%rdi), %ymm7
8974; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8975; AVX2-NEXT:    vmovaps 288(%rdi), %ymm2
8976; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8977; AVX2-NEXT:    vmovaps 256(%rdi), %ymm8
8978; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8979; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
8980; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8981; AVX2-NEXT:    vmovaps 192(%rdi), %ymm1
8982; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8983; AVX2-NEXT:    vmovaps {{.*#+}} xmm9 = [0,6,4,u]
8984; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
8985; AVX2-NEXT:    vpermps %ymm14, %ymm9, %ymm0
8986; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
8987; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7]
8988; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6]
8989; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
8990; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
8991; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8992; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
8993; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
8994; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
8995; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8996; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8997; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1]
8998; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
8999; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9000; AVX2-NEXT:    vmovaps 576(%rdi), %ymm0
9001; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9002; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
9003; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9004; AVX2-NEXT:    vpermps %ymm0, %ymm9, %ymm0
9005; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
9006; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9007; AVX2-NEXT:    vmovaps 704(%rdi), %ymm1
9008; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9009; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
9010; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9011; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9012; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9013; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
9014; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9015; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9016; AVX2-NEXT:    vmovaps 1056(%rdi), %ymm1
9017; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9018; AVX2-NEXT:    vmovaps 1024(%rdi), %ymm0
9019; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9020; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
9021; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9022; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9023; AVX2-NEXT:    vmovaps 992(%rdi), %ymm0
9024; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9025; AVX2-NEXT:    vmovaps 960(%rdi), %ymm1
9026; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9027; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9028; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9029; AVX2-NEXT:    vpermps %ymm0, %ymm9, %ymm0
9030; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
9031; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9032; AVX2-NEXT:    vmovaps 1088(%rdi), %ymm1
9033; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9034; AVX2-NEXT:    vmovaps 1120(%rdi), %ymm2
9035; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9036; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9037; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9038; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
9039; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9040; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9041; AVX2-NEXT:    vmovaps 1440(%rdi), %ymm1
9042; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9043; AVX2-NEXT:    vmovaps 1408(%rdi), %ymm0
9044; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9045; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
9046; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9047; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9048; AVX2-NEXT:    vmovaps 1376(%rdi), %ymm0
9049; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9050; AVX2-NEXT:    vmovaps 1344(%rdi), %ymm1
9051; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9052; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9053; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9054; AVX2-NEXT:    vpermps %ymm0, %ymm9, %ymm0
9055; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
9056; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9057; AVX2-NEXT:    vmovaps 1472(%rdi), %ymm1
9058; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9059; AVX2-NEXT:    vmovaps 1504(%rdi), %ymm2
9060; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9061; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9062; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9063; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
9064; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9065; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9066; AVX2-NEXT:    vmovaps 96(%rdi), %ymm1
9067; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
9068; AVX2-NEXT:    vmovaps 64(%rdi), %ymm0
9069; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9070; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
9071; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9072; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9073; AVX2-NEXT:    vmovaps (%rdi), %ymm0
9074; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9075; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
9076; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9077; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9078; AVX2-NEXT:    vpermps %ymm13, %ymm9, %ymm0
9079; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
9080; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9081; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
9082; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9083; AVX2-NEXT:    vmovaps 160(%rdi), %ymm2
9084; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9085; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9086; AVX2-NEXT:    vpermps %ymm12, %ymm6, %ymm2
9087; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9088; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9089; AVX2-NEXT:    vmovaps 480(%rdi), %ymm1
9090; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9091; AVX2-NEXT:    vmovaps 448(%rdi), %ymm0
9092; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9093; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
9094; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9095; AVX2-NEXT:    vmovaps 416(%rdi), %ymm0
9096; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9097; AVX2-NEXT:    vmovaps 384(%rdi), %ymm1
9098; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9099; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9100; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm0
9101; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2,2,2,4,6,6,6]
9102; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9103; AVX2-NEXT:    vmovaps 512(%rdi), %ymm1
9104; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9105; AVX2-NEXT:    vmovaps 544(%rdi), %ymm2
9106; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9107; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9108; AVX2-NEXT:    vpermps %ymm8, %ymm6, %ymm2
9109; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9110; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9111; AVX2-NEXT:    vmovaps 864(%rdi), %ymm1
9112; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9113; AVX2-NEXT:    vmovaps 832(%rdi), %ymm0
9114; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9115; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
9116; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9117; AVX2-NEXT:    vmovaps 800(%rdi), %ymm0
9118; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9119; AVX2-NEXT:    vmovaps 768(%rdi), %ymm1
9120; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9121; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9122; AVX2-NEXT:    vpermps %ymm4, %ymm9, %ymm0
9123; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
9124; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9125; AVX2-NEXT:    vmovaps 896(%rdi), %ymm1
9126; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9127; AVX2-NEXT:    vmovaps 928(%rdi), %ymm2
9128; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9129; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9130; AVX2-NEXT:    vpermps %ymm3, %ymm6, %ymm2
9131; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9132; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9133; AVX2-NEXT:    vmovaps 1184(%rdi), %ymm0
9134; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9135; AVX2-NEXT:    vmovaps 1152(%rdi), %ymm1
9136; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9137; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9138; AVX2-NEXT:    vpermps %ymm2, %ymm9, %ymm0
9139; AVX2-NEXT:    vmovaps 1248(%rdi), %ymm1
9140; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9141; AVX2-NEXT:    vmovaps 1216(%rdi), %ymm5
9142; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9143; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1]
9144; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7]
9145; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6]
9146; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
9147; AVX2-NEXT:    vmovaps 1280(%rdi), %ymm1
9148; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9149; AVX2-NEXT:    vmovaps 1312(%rdi), %ymm5
9150; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9151; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
9152; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm1
9153; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9154; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9155; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [1,7,5,u]
9156; AVX2-NEXT:    vpermps %ymm14, %ymm0, %ymm1
9157; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7]
9158; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
9159; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
9160; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
9161; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
9162; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9163; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
9164; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
9165; AVX2-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
9166; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
9167; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
9168; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
9169; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9170; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
9171; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
9172; AVX2-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
9173; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
9174; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
9175; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
9176; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9177; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
9178; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
9179; AVX2-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
9180; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
9181; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
9182; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
9183; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9184; AVX2-NEXT:    vpermps %ymm13, %ymm0, %ymm13
9185; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
9186; AVX2-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
9187; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
9188; AVX2-NEXT:    vpermps %ymm12, %ymm1, %ymm12
9189; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
9190; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9191; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm10
9192; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
9193; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
9194; AVX2-NEXT:    vpermps %ymm8, %ymm1, %ymm8
9195; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
9196; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9197; AVX2-NEXT:    vpermps %ymm4, %ymm0, %ymm4
9198; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
9199; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
9200; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm3
9201; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
9202; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9203; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm0
9204; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7]
9205; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9206; AVX2-NEXT:    vpermps %ymm5, %ymm1, %ymm2
9207; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9208; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9209; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
9210; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9211; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
9212; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
9213; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9214; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9215; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9216; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
9217; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9218; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9219; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9220; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9221; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9222; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9223; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
9224; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9225; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
9226; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9227; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9228; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9229; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9230; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9231; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
9232; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
9233; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9234; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9235; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9236; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
9237; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9238; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9239; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9240; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9241; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9242; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9243; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
9244; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9245; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
9246; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9247; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9248; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9249; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9250; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
9251; AVX2-NEXT:    # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
9252; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
9253; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9254; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9255; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9256; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
9257; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9258; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9259; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9260; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9261; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9262; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9263; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
9264; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9265; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
9266; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9267; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9268; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9269; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9270; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9271; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
9272; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
9273; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9274; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9275; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9276; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
9277; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9278; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9279; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9280; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9281; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9282; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9283; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
9284; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9285; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
9286; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9287; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9288; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9289; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
9290; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9291; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
9292; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
9293; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9294; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9295; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9296; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9297; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9298; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9299; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9300; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9301; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9302; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9303; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
9304; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9305; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
9306; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9307; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9308; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9309; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9310; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9311; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
9312; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
9313; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3]
9314; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9315; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9316; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
9317; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9318; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7]
9319; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
9320; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
9321; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9322; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9323; AVX2-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
9324; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9325; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4]
9326; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
9327; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
9328; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9329; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9330; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
9331; AVX2-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
9332; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9333; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
9334; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9335; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
9336; AVX2-NEXT:    # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
9337; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7]
9338; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
9339; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
9340; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9341; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
9342; AVX2-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
9343; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4]
9344; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
9345; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
9346; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9347; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
9348; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
9349; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7]
9350; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
9351; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3]
9352; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9353; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9354; AVX2-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
9355; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7]
9356; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
9357; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
9358; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9359; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
9360; AVX2-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
9361; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4]
9362; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
9363; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
9364; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9365; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
9366; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
9367; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9368; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9369; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
9370; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
9371; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
9372; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9373; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
9374; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
9375; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
9376; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9377; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
9378; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
9379; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9380; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9381; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
9382; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
9383; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
9384; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9385; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
9386; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
9387; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
9388; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9389; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
9390; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9391; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
9392; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9393; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9394; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
9395; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
9396; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
9397; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9398; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
9399; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
9400; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
9401; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9402; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9403; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
9404; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9405; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
9406; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9407; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9408; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
9409; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
9410; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
9411; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9412; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
9413; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
9414; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
9415; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9416; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
9417; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
9418; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9419; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7]
9420; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9421; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9422; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5]
9423; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9424; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9425; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9426; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9427; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
9428; AVX2-NEXT:    vmovaps %ymm13, %ymm5
9429; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
9430; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9431; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7]
9432; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9433; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9434; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5]
9435; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9436; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9437; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9438; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9439; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7]
9440; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
9441; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
9442; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9443; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9444; AVX2-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
9445; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9446; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9447; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9448; AVX2-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
9449; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9450; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9451; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9452; AVX2-NEXT:    vmovups (%rsp), %ymm14 # 32-byte Reload
9453; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7]
9454; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9455; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
9456; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
9457; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9458; AVX2-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
9459; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
9460; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
9461; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9462; AVX2-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
9463; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
9464; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9465; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9466; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7]
9467; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9468; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9469; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9470; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
9471; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9472; AVX2-NEXT:    vmovaps 80(%rdi), %xmm0
9473; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9474; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
9475; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9476; AVX2-NEXT:    vpermps %ymm3, %ymm6, %ymm2
9477; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9478; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9479; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9480; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9481; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9482; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
9483; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
9484; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
9485; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9486; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9487; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9488; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9489; AVX2-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
9490; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
9491; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9492; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
9493; AVX2-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
9494; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9495; AVX2-NEXT:    vmovaps 272(%rdi), %xmm0
9496; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9497; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
9498; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9499; AVX2-NEXT:    vpermps %ymm14, %ymm6, %ymm2
9500; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9501; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9502; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9503; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9504; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9505; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
9506; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9507; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9508; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
9509; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9510; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9511; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
9512; AVX2-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
9513; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9514; AVX2-NEXT:    vmovaps 464(%rdi), %xmm0
9515; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9516; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
9517; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9518; AVX2-NEXT:    vpermps %ymm12, %ymm6, %ymm2
9519; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9520; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9521; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9522; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9523; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9524; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
9525; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9526; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9527; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9528; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9529; AVX2-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
9530; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9531; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9532; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
9533; AVX2-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
9534; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9535; AVX2-NEXT:    vmovaps 656(%rdi), %xmm0
9536; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9537; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
9538; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9539; AVX2-NEXT:    vpermps %ymm12, %ymm6, %ymm2
9540; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9541; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9542; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9543; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9544; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9545; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
9546; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9547; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9548; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
9549; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9550; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9551; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
9552; AVX2-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
9553; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9554; AVX2-NEXT:    vmovaps 848(%rdi), %xmm0
9555; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9556; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
9557; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9558; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm2
9559; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9560; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9561; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9562; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9563; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9564; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
9565; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9566; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9567; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
9568; AVX2-NEXT:    # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7]
9569; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9570; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
9571; AVX2-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
9572; AVX2-NEXT:    vmovaps 1040(%rdi), %xmm13
9573; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
9574; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9575; AVX2-NEXT:    vpermps %ymm14, %ymm6, %ymm2
9576; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9577; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9578; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
9579; AVX2-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9580; AVX2-NEXT:    vpermps %ymm12, %ymm3, %ymm2
9581; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9582; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9583; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7]
9584; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9585; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9586; AVX2-NEXT:    # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
9587; AVX2-NEXT:    vmovaps 1232(%rdi), %xmm9
9588; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
9589; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9590; AVX2-NEXT:    vpermps %ymm10, %ymm6, %ymm2
9591; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9592; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9593; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
9594; AVX2-NEXT:    # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9595; AVX2-NEXT:    vpermps %ymm8, %ymm3, %ymm2
9596; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
9597; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9598; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
9599; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9600; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
9601; AVX2-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
9602; AVX2-NEXT:    vmovaps 1424(%rdi), %xmm4
9603; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7]
9604; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
9605; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm2
9606; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
9607; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9608; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9609; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
9610; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm3
9611; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
9612; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9613; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
9614; AVX2-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
9615; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9616; AVX2-NEXT:    # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
9617; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
9618; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7]
9619; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
9620; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
9621; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
9622; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
9623; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9624; AVX2-NEXT:    vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload
9625; AVX2-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
9626; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
9627; AVX2-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
9628; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
9629; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
9630; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
9631; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
9632; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9633; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9634; AVX2-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
9635; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
9636; AVX2-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
9637; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
9638; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
9639; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
9640; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
9641; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9642; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9643; AVX2-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
9644; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
9645; AVX2-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
9646; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
9647; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
9648; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
9649; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
9650; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9651; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
9652; AVX2-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
9653; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
9654; AVX2-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
9655; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
9656; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
9657; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
9658; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7]
9659; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5]
9660; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
9661; AVX2-NEXT:    vpermps %ymm14, %ymm1, %ymm13
9662; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7]
9663; AVX2-NEXT:    vpermps %ymm12, %ymm0, %ymm12
9664; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7]
9665; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5]
9666; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7]
9667; AVX2-NEXT:    vpermps %ymm10, %ymm1, %ymm9
9668; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7]
9669; AVX2-NEXT:    vpermps %ymm8, %ymm0, %ymm8
9670; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7]
9671; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
9672; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
9673; AVX2-NEXT:    vpermps %ymm5, %ymm1, %ymm1
9674; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
9675; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm0
9676; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
9677; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9678; AVX2-NEXT:    vmovaps %ymm1, 192(%rsi)
9679; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9680; AVX2-NEXT:    vmovaps %ymm1, 128(%rsi)
9681; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9682; AVX2-NEXT:    vmovaps %ymm1, 64(%rsi)
9683; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9684; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
9685; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9686; AVX2-NEXT:    vmovaps %ymm1, 224(%rsi)
9687; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9688; AVX2-NEXT:    vmovaps %ymm1, 160(%rsi)
9689; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9690; AVX2-NEXT:    vmovaps %ymm1, 96(%rsi)
9691; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9692; AVX2-NEXT:    vmovaps %ymm1, 32(%rsi)
9693; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9694; AVX2-NEXT:    vmovaps %ymm1, 192(%rdx)
9695; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9696; AVX2-NEXT:    vmovaps %ymm1, 128(%rdx)
9697; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9698; AVX2-NEXT:    vmovaps %ymm1, 64(%rdx)
9699; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9700; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
9701; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9702; AVX2-NEXT:    vmovaps %ymm1, 224(%rdx)
9703; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9704; AVX2-NEXT:    vmovaps %ymm1, 160(%rdx)
9705; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9706; AVX2-NEXT:    vmovaps %ymm1, 96(%rdx)
9707; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9708; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
9709; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9710; AVX2-NEXT:    vmovaps %ymm1, 192(%rcx)
9711; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9712; AVX2-NEXT:    vmovaps %ymm1, 128(%rcx)
9713; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9714; AVX2-NEXT:    vmovaps %ymm1, 64(%rcx)
9715; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9716; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
9717; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9718; AVX2-NEXT:    vmovaps %ymm1, 224(%rcx)
9719; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9720; AVX2-NEXT:    vmovaps %ymm1, 160(%rcx)
9721; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9722; AVX2-NEXT:    vmovaps %ymm1, 96(%rcx)
9723; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9724; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
9725; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9726; AVX2-NEXT:    vmovaps %ymm1, (%r8)
9727; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9728; AVX2-NEXT:    vmovaps %ymm1, 64(%r8)
9729; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9730; AVX2-NEXT:    vmovaps %ymm1, 128(%r8)
9731; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9732; AVX2-NEXT:    vmovaps %ymm1, 192(%r8)
9733; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9734; AVX2-NEXT:    vmovaps %ymm1, 224(%r8)
9735; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9736; AVX2-NEXT:    vmovaps %ymm1, 160(%r8)
9737; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9738; AVX2-NEXT:    vmovaps %ymm1, 96(%r8)
9739; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9740; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
9741; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9742; AVX2-NEXT:    vmovaps %ymm1, 224(%r9)
9743; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9744; AVX2-NEXT:    vmovaps %ymm1, 192(%r9)
9745; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9746; AVX2-NEXT:    vmovaps %ymm1, 160(%r9)
9747; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9748; AVX2-NEXT:    vmovaps %ymm1, 128(%r9)
9749; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9750; AVX2-NEXT:    vmovaps %ymm1, 96(%r9)
9751; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9752; AVX2-NEXT:    vmovaps %ymm1, 64(%r9)
9753; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9754; AVX2-NEXT:    vmovaps %ymm1, 32(%r9)
9755; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9756; AVX2-NEXT:    vmovaps %ymm1, (%r9)
9757; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9758; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
9759; AVX2-NEXT:    vmovaps %ymm8, 192(%rax)
9760; AVX2-NEXT:    vmovaps %ymm12, 160(%rax)
9761; AVX2-NEXT:    vmovaps %ymm6, 128(%rax)
9762; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9763; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
9764; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9765; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
9766; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9767; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
9768; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9769; AVX2-NEXT:    vmovaps %ymm0, (%rax)
9770; AVX2-NEXT:    addq $2568, %rsp # imm = 0xA08
9771; AVX2-NEXT:    vzeroupper
9772; AVX2-NEXT:    retq
9773;
9774; AVX2-FP-LABEL: load_i32_stride6_vf64:
9775; AVX2-FP:       # %bb.0:
9776; AVX2-FP-NEXT:    subq $2568, %rsp # imm = 0xA08
9777; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm4
9778; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm5
9779; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9780; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm3
9781; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9782; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm6
9783; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9784; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm7
9785; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9786; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm2
9787; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9788; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm8
9789; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9790; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
9791; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9792; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm1
9793; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9794; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm9 = [0,6,4,u]
9795; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9796; AVX2-FP-NEXT:    vpermps %ymm14, %ymm9, %ymm0
9797; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
9798; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7]
9799; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6]
9800; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9801; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
9802; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9803; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
9804; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
9805; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9806; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9807; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9808; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1]
9809; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
9810; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9811; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm0
9812; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1056(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1024(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 992(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 960(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 1088(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1120(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1408(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1376(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1344(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 1472(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1504(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermps %ymm13, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 864(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 832(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps 800(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 768(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm4, %ymm9, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 896(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 928(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1152(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm9, %ymm0
; AVX2-FP-NEXT:    vmovaps 1248(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 1280(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 1312(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,7,5,u]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm0, %ymm1
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm13, %ymm0, %ymm13
; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm1, %ymm12
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm4, %ymm0, %ymm4
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm5, %ymm1, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovaps %ymm13, %ymm5
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 464(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 656(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 848(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 1040(%rdi), %xmm13
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 1232(%rdi), %xmm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm3, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps 1424(%rdi), %xmm4
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm1, %ymm13
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm0, %ymm12
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm1, %ymm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm0, %ymm8
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm5, %ymm1, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 192(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 224(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 160(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 192(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, (%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 224(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 160(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 192(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, (%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 224(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 160(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, (%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 192(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 224(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 160(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%r8)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 224(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 192(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 160(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%r9)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm8, 192(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm12, 160(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rax)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FP-NEXT:    addq $2568, %rsp # imm = 0xA08
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf64:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    subq $2536, %rsp # imm = 0x9E8
; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm5
; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm6
; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm7
; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm8
; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm12, %ymm0
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm4, %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[0,1],ymm5[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1056(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1024(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 1088(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1120(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1376(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1344(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovaps 1472(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 1504(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm12, %ymm0
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
10704; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
10705; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10706; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm2
10707; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10708; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10709; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm4, %ymm2
10710; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10711; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10712; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm1
10713; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10714; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm0
10715; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10716; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
10717; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10718; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm0
10719; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10720; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm1
10721; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10722; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
10723; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm12, %ymm0
10724; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,2,2,2,4,6,6,6]
10725; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
10726; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm1
10727; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10728; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm2
10729; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10730; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10731; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm2
10732; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10733; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10734; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10735; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %ymm1
10736; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10737; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %ymm0
10738; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10739; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
10740; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10741; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %ymm0
10742; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10743; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %ymm1
10744; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10745; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
10746; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm12, %ymm0
10747; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
10748; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
10749; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %ymm1
10750; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10751; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %ymm2
10752; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10753; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10754; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm2
10755; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10756; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10757; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %ymm0
10758; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10759; AVX2-FCP-NEXT:    vmovaps 1152(%rdi), %ymm1
10760; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10761; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
10762; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm0
10763; AVX2-FCP-NEXT:    vmovaps 1248(%rdi), %ymm1
10764; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10765; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %ymm2
10766; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10767; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm1[0,1]
10768; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10769; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2,2,2,4,6,6,6]
10770; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
10771; AVX2-FCP-NEXT:    vmovaps 1280(%rdi), %ymm1
10772; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10773; AVX2-FCP-NEXT:    vmovaps 1312(%rdi), %ymm2
10774; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10775; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10776; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm4, %ymm1
10777; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10778; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10779; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,7,5,u]
10780; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm1
10781; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10782; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
10783; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
10784; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
10785; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
10786; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
10787; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10788; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
10789; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10790; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
10791; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
10792; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
10793; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
10794; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10795; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
10796; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10797; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
10798; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
10799; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
10800; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
10801; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10802; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
10803; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10804; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
10805; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
10806; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
10807; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm15[6,7]
10808; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10809; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm0, %ymm13
10810; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10811; AVX2-FCP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
10812; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
10813; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm1, %ymm11
10814; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm11[6,7]
10815; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10816; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm9
10817; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
10818; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
10819; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
10820; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm8[6,7]
10821; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10822; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
10823; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
10824; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
10825; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm1, %ymm5
10826; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10827; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10828; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm0
10829; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,3,2,3,5,7,6,7]
10830; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
10831; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
10832; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10833; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10834; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10835; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
10836; AVX2-FCP-NEXT:    # ymm0 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
10837; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm13 = [2,0,6,4,2,0,6,7]
10838; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10839; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [2,0,6,7]
10840; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10841; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10842; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
10843; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10844; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
10845; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10846; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10847; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10848; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10849; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10850; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm15 = [0,0,6,4,0,0,6,4]
10851; AVX2-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
10852; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
10853; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
10854; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10855; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10856; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
10857; AVX2-FCP-NEXT:    # ymm0 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7]
10858; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10859; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10860; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10861; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
10862; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10863; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
10864; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10865; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10866; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10867; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10868; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10869; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
10870; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
10871; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10872; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10873; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10874; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
10875; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10876; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10877; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10878; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
10879; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10880; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
10881; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10882; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10883; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
10884; AVX2-FCP-NEXT:    # ymm8 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10885; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm15, %ymm2
10886; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
10887; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10888; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10889; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
10890; AVX2-FCP-NEXT:    # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
10891; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10892; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10893; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
10894; AVX2-FCP-NEXT:    # ymm14 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
10895; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm12, %ymm2
10896; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10897; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10898; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
10899; AVX2-FCP-NEXT:    # ymm11 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10900; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm15, %ymm2
10901; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
10902; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10903; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10904; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10905; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
10906; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10907; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10908; AVX2-FCP-NEXT:    vblendps $243, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
10909; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
10910; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10911; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
10912; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10913; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10914; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10915; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10916; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10917; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
10918; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
10919; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10920; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10921; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10922; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
10923; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10924; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10925; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10926; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
10927; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10928; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
10929; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10930; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10931; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10932; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10933; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10934; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
10935; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
10936; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10937; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10938; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
10939; AVX2-FCP-NEXT:    # ymm0 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7]
10940; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10941; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10942; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
10943; AVX2-FCP-NEXT:    # ymm9 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
10944; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm12, %ymm2
10945; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10946; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10947; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10948; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
10949; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm5
10950; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
10951; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10952; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10953; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
10954; AVX2-FCP-NEXT:    # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7]
10955; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
10956; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10957; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
10958; AVX2-FCP-NEXT:    # ymm13 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7]
10959; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm12, %ymm5
10960; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3,4,5,6,7]
10961; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10962; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10963; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
10964; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm12
10965; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7]
10966; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10967; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm5 = mem[3,3,3,3]
10968; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7]
10969; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload
10970; AVX2-FCP-NEXT:    # ymm12 = ymm5[0],mem[1,2,3,4],ymm5[5],mem[6,7]
10971; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [3,1,7,5,0,u,u,u]
10972; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm5, %ymm15
10973; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm12 = [0,1,7,5,0,1,7,5]
10974; AVX2-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
10975; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
10976; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7]
10977; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10978; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
10979; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
10980; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10981; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7]
10982; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
10983; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload
10984; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm15[5,6,7]
10985; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10986; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
10987; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10988; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7]
10989; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10990; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7]
10991; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
10992; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm8
10993; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm8[5,6,7]
10994; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10995; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
10996; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7]
10997; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3,4],ymm7[5],ymm14[6,7]
10998; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
10999; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm6
11000; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm6[5,6,7]
11001; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11002; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm6 = mem[3,3,3,3]
11003; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
11004; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1,2,3,4],ymm6[5],ymm13[6,7]
11005; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm6
11006; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
11007; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
11008; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11009; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
11010; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
11011; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3,4],ymm0[5],ymm9[6,7]
11012; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11013; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
11014; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11015; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11016; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
11017; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11018; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
11019; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11020; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1,2,3,4],ymm0[5],mem[6,7]
11021; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11022; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
11023; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11024; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11025; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
11026; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
11027; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11028; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7]
11029; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11030; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1,2,3,4],ymm2[5],mem[6,7]
11031; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
11032; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
11033; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11034; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload
11035; AVX2-FCP-NEXT:    # ymm2 = ymm4[0,1,2,3],mem[4,5,6,7]
11036; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11037; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11038; AVX2-FCP-NEXT:    vblendps $15, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload
11039; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3],ymm0[4,5,6,7]
11040; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11041; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm0
11042; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
11043; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
11044; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11045; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11046; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm2
11047; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11048; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11049; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11050; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11051; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11052; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6]
11053; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
11054; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
11055; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11056; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11057; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11058; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
11059; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
11060; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11061; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11062; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
11063; AVX2-FCP-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
11064; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11065; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm0
11066; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11067; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
11068; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11069; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm2
11070; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11071; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11072; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11073; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11074; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11075; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
11076; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11077; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11078; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
11079; AVX2-FCP-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7]
11080; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11081; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11082; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
11083; AVX2-FCP-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
11084; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11085; AVX2-FCP-NEXT:    vmovaps 464(%rdi), %xmm0
11086; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11087; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
11088; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11089; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm2
11090; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11091; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11092; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11093; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11094; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11095; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
11096; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11097; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11098; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11099; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
11100; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
11101; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11102; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11103; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
11104; AVX2-FCP-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
11105; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11106; AVX2-FCP-NEXT:    vmovaps 656(%rdi), %xmm0
11107; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11108; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
11109; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11110; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm2
11111; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11112; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11113; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11114; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11115; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11116; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
11117; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11118; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11119; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
11120; AVX2-FCP-NEXT:    # ymm2 = ymm10[0,1,2,3],mem[4,5,6,7]
11121; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11122; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11123; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
11124; AVX2-FCP-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
11125; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11126; AVX2-FCP-NEXT:    vmovaps 848(%rdi), %xmm0
11127; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11128; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
11129; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11130; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm2
11131; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11132; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11133; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11134; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11135; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11136; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
11137; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11138; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11139; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11140; AVX2-FCP-NEXT:    # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
11141; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11142; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
11143; AVX2-FCP-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
11144; AVX2-FCP-NEXT:    vmovaps 1040(%rdi), %xmm13
11145; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
11146; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11147; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm4, %ymm2
11148; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11149; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11150; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
11151; AVX2-FCP-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11152; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm5, %ymm2
11153; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11154; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11155; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
11156; AVX2-FCP-NEXT:    # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
11157; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11158; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11159; AVX2-FCP-NEXT:    # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
11160; AVX2-FCP-NEXT:    vmovaps 1232(%rdi), %xmm9
11161; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
11162; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11163; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm4, %ymm2
11164; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11165; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11166; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
11167; AVX2-FCP-NEXT:    # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11168; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm2
11169; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11170; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11171; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11172; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
11173; AVX2-FCP-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
11174; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11175; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
11176; AVX2-FCP-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
11177; AVX2-FCP-NEXT:    vmovaps 1424(%rdi), %xmm3
11178; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm3[2,3],ymm7[4,5,6,7]
11179; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
11180; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm2
11181; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
11182; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11183; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11184; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
11185; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm4
11186; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
11187; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11188; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
11189; AVX2-FCP-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
11190; AVX2-FCP-NEXT:    vblendps $8, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
11191; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
11192; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
11193; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3,4,5,6,7]
11194; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
11195; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
11196; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11197; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
11198; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11199; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11200; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
11201; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11202; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
11203; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
11204; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
11205; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11206; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
11207; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11208; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11209; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
11210; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11211; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
11212; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
11213; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
11214; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11215; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
11216; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11217; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11218; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
11219; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11220; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
11221; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
11222; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
11223; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11224; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
11225; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11226; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11227; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
11228; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11229; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
11230; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
11231; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
11232; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11233; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
11234; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5]
11235; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7]
11236; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm1, %ymm13
11237; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5,6,7]
11238; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm0, %ymm12
11239; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4],ymm12[5,6,7]
11240; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5]
11241; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5,6,7]
11242; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm1, %ymm9
11243; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7]
11244; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm0, %ymm8
11245; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7]
11246; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5]
11247; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7]
11248; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm1, %ymm1
11249; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
11250; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
11251; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
11252; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11253; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%rsi)
11254; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11255; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%rsi)
11256; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11257; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rsi)
11258; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11259; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rsi)
11260; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11261; AVX2-FCP-NEXT:    vmovaps %ymm1, 224(%rsi)
11262; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11263; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%rsi)
11264; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11265; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rsi)
11266; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11267; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rsi)
11268; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11269; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%rdx)
11270; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11271; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%rdx)
11272; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11273; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rdx)
11274; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11275; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rdx)
11276; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11277; AVX2-FCP-NEXT:    vmovaps %ymm1, 224(%rdx)
11278; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11279; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%rdx)
11280; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11281; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rdx)
11282; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11283; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rdx)
11284; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11285; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%rcx)
11286; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11287; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%rcx)
11288; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11289; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rcx)
11290; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11291; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rcx)
11292; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11293; AVX2-FCP-NEXT:    vmovaps %ymm1, 224(%rcx)
11294; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11295; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%rcx)
11296; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11297; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rcx)
11298; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11299; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rcx)
11300; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11301; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r8)
11302; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11303; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%r8)
11304; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11305; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%r8)
11306; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11307; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%r8)
11308; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11309; AVX2-FCP-NEXT:    vmovaps %ymm1, 224(%r8)
11310; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11311; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%r8)
11312; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11313; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%r8)
11314; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11315; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r8)
11316; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11317; AVX2-FCP-NEXT:    vmovaps %ymm1, 224(%r9)
11318; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11319; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%r9)
11320; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11321; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%r9)
11322; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11323; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%r9)
11324; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11325; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%r9)
11326; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11327; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%r9)
11328; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11329; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r9)
11330; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11331; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
11332; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11333; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
11334; AVX2-FCP-NEXT:    vmovaps %ymm8, 192(%rax)
11335; AVX2-FCP-NEXT:    vmovaps %ymm12, 160(%rax)
11336; AVX2-FCP-NEXT:    vmovaps %ymm4, 128(%rax)
11337; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11338; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
11339; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11340; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
11341; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11342; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
11343; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11344; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
11345; AVX2-FCP-NEXT:    addq $2536, %rsp # imm = 0x9E8
11346; AVX2-FCP-NEXT:    vzeroupper
11347; AVX2-FCP-NEXT:    retq
11348;
11349; AVX512-LABEL: load_i32_stride6_vf64:
11350; AVX512:       # %bb.0:
11351; AVX512-NEXT:    subq $2632, %rsp # imm = 0xA48
11352; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm21
11353; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm1
11354; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm20
11355; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm0
11356; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm2
11357; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm18
11358; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm27
11359; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm3
11360; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm25
11361; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm4
11362; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
11363; AVX512-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
11364; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
11365; AVX512-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
11366; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm7
11367; AVX512-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
11368; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11369; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm7
11370; AVX512-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
11371; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11372; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm7
11373; AVX512-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
11374; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11375; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm7
11376; AVX512-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
11377; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11378; AVX512-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
11379; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11380; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
11381; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
11382; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8
11383; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm6
11384; AVX512-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
11385; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11386; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
11387; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
11388; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
11389; AVX512-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
11390; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11391; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm0
11392; AVX512-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
11393; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11394; AVX512-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
11395; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11396; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
11397; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11398; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm7
11399; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
11400; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11401; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm7
11402; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
11403; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11404; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm7
11405; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
11406; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11407; AVX512-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
11408; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11409; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
11410; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11411; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm7
11412; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
11413; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11414; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm7
11415; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
11416; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11417; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm7
11418; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
11419; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11420; AVX512-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
11421; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11422; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
11423; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
11424; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
11425; AVX512-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
11426; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11427; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
11428; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11429; AVX512-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
11430; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11431; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm3
11432; AVX512-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
11433; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11434; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
11435; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11436; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm3
11437; AVX512-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
11438; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11439; AVX512-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
11440; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11441; AVX512-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
11442; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11443; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm1
11444; AVX512-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
11445; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11446; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
11447; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11448; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
11449; AVX512-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
11450; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
11451; AVX512-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
11452; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11453; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
11454; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
11455; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
11456; AVX512-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
11457; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11458; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
11459; AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
11460; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
11461; AVX512-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
11462; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11463; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
11464; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11465; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
11466; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11467; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm26
11468; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm1
11469; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
11470; AVX512-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
11471; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11472; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
11473; AVX512-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
11474; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11475; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm2
11476; AVX512-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
11477; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11478; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm2
11479; AVX512-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
11480; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11481; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm2
11482; AVX512-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
11483; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11484; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
11485; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm22
11486; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm1
11487; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
11488; AVX512-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
11489; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11490; AVX512-NEXT:    vmovdqa64 1280(%rdi), %zmm19
11491; AVX512-NEXT:    vmovdqa64 1344(%rdi), %zmm2
11492; AVX512-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
11493; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11494; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4
11495; AVX512-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
11496; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11497; AVX512-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
11498; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11499; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm28
11500; AVX512-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
11501; AVX512-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
11502; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm29
11503; AVX512-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
11504; AVX512-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
11505; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
11506; AVX512-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
11507; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11508; AVX512-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
11509; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11510; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
11511; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
11512; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm2
11513; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm0
11514; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
11515; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm20
11516; AVX512-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
11517; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
11518; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm21
11519; AVX512-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
11520; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
11521; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1
11522; AVX512-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
11523; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11524; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
11525; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1
11526; AVX512-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
11527; AVX512-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
11528; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
11529; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm23
11530; AVX512-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
11531; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
11532; AVX512-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
11533; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
11534; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm6
11535; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm15
11536; AVX512-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
11537; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm16
11538; AVX512-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
11539; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm27
11540; AVX512-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
11541; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm30
11542; AVX512-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
11543; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm17
11544; AVX512-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
11545; AVX512-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
11546; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm10
11547; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm7
11548; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm8
11549; AVX512-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
11550; AVX512-NEXT:    vmovdqa64 1216(%rdi), %zmm1
11551; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm0
11552; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
11553; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm9
11554; AVX512-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
11555; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
11556; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm13
11557; AVX512-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
11558; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
11559; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm18
11560; AVX512-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
11561; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
11562; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm11
11563; AVX512-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
11564; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
11565; AVX512-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
11566; AVX512-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
11567; AVX512-NEXT:    movb $56, %al
11568; AVX512-NEXT:    kmovw %eax, %k2
11569; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11570; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
11571; AVX512-NEXT:    movw $-2048, %ax # imm = 0xF800
11572; AVX512-NEXT:    kmovw %eax, %k1
11573; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11574; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
11575; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11576; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
11577; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11578; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
11579; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11580; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
11581; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11582; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
11583; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11584; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
11585; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11586; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
11587; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11588; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
11589; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11590; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
11591; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11592; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
11593; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11594; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
11595; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11596; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
11597; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11598; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
11599; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11600; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
11601; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11602; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
11603; AVX512-NEXT:    movw $31, %ax
11604; AVX512-NEXT:    kmovw %eax, %k2
11605; AVX512-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
11606; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11607; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
11608; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11609; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11610; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
11611; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11612; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
11613; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm13
11614; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11615; AVX512-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
11616; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11617; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
11618; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm24
11619; AVX512-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
11620; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11621; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
11622; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
11623; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11624; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
11625; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11626; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
11627; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
11628; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11629; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
11630; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
11631; AVX512-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
11632; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11633; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
11634; AVX512-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
11635; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11636; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
11637; AVX512-NEXT:    movw $992, %ax # imm = 0x3E0
11638; AVX512-NEXT:    kmovw %eax, %k1
11639; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11640; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
11641; AVX512-NEXT:    movb $-32, %al
11642; AVX512-NEXT:    kmovw %eax, %k2
11643; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11644; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
11645; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11646; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
11647; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11648; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
11649; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11650; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
11651; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11652; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
11653; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11654; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
11655; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11656; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
11657; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11658; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
11659; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11660; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
11661; AVX512-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
11662; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11663; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
11664; AVX512-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
11665; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11666; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
11667; AVX512-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
11668; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11669; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
11670; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
11671; AVX512-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
11672; AVX512-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
11673; AVX512-NEXT:    vmovdqa64 %zmm15, (%rsi)
11674; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
11675; AVX512-NEXT:    vmovdqa64 %zmm16, (%rdx)
11676; AVX512-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
11677; AVX512-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
11678; AVX512-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
11679; AVX512-NEXT:    vmovdqa64 %zmm24, (%rcx)
11680; AVX512-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
11681; AVX512-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
11682; AVX512-NEXT:    vmovdqa64 %zmm25, 192(%r8)
11683; AVX512-NEXT:    vmovdqa64 %zmm12, (%r8)
11684; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%r8)
11685; AVX512-NEXT:    vmovdqa64 %zmm29, 128(%r8)
11686; AVX512-NEXT:    vmovdqa64 %zmm5, 192(%r9)
11687; AVX512-NEXT:    vmovdqa64 %zmm17, (%r9)
11688; AVX512-NEXT:    vmovdqa64 %zmm23, 64(%r9)
11689; AVX512-NEXT:    vmovdqa64 %zmm11, 128(%r9)
11690; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11691; AVX512-NEXT:    vmovdqa64 %zmm10, 128(%rax)
11692; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
11693; AVX512-NEXT:    vmovdqa64 %zmm6, (%rax)
11694; AVX512-NEXT:    vmovdqa64 %zmm2, 64(%rax)
11695; AVX512-NEXT:    addq $2632, %rsp # imm = 0xA48
11696; AVX512-NEXT:    vzeroupper
11697; AVX512-NEXT:    retq
11698;
11699; AVX512-FCP-LABEL: load_i32_stride6_vf64:
11700; AVX512-FCP:       # %bb.0:
11701; AVX512-FCP-NEXT:    subq $2632, %rsp # imm = 0xA48
11702; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm21
11703; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm1
11704; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm20
11705; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm0
11706; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
11707; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm18
11708; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm27
11709; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm3
11710; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm25
11711; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
11712; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
11713; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
11714; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
11715; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
11716; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm7
11717; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
11718; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11719; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
11720; AVX512-FCP-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
11721; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11722; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm7
11723; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
11724; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11725; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7
11726; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
11727; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11728; AVX512-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
11729; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11730; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
11731; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
11732; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8
11733; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
11734; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
11735; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11736; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
11737; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
11738; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
11739; AVX512-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
11740; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11741; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
11742; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
11743; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11744; AVX512-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
11745; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11746; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
11747; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11748; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
11749; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
11750; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11751; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
11752; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
11753; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11754; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
11755; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
11756; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11757; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
11758; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11759; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
11760; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11761; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
11762; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
11763; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11764; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
11765; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
11766; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11767; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
11768; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
11769; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11770; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
11771; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11772; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
11773; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
11774; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
11775; AVX512-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
11776; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11777; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
11778; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11779; AVX512-FCP-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
11780; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11781; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
11782; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
11783; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11784; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
11785; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11786; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
11787; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
11788; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11789; AVX512-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
11790; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11791; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
11792; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11793; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
11794; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
11795; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11796; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
11797; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11798; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
11799; AVX512-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
11800; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
11801; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
11802; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11803; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
11804; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
11805; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
11806; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
11807; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11808; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
11809; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
11810; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
11811; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
11812; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11813; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
11814; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11815; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
11816; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11817; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm26
11818; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm1
11819; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
11820; AVX512-FCP-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
11821; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11822; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
11823; AVX512-FCP-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
11824; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11825; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
11826; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
11827; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11828; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
11829; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
11830; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11831; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
11832; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
11833; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11834; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
11835; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm22
11836; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm1
11837; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
11838; AVX512-FCP-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
11839; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11840; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm19
11841; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm2
11842; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
11843; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11844; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
11845; AVX512-FCP-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
11846; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11847; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
11848; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11849; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm28
11850; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
11851; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
11852; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm29
11853; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
11854; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
11855; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
11856; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
11857; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11858; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
11859; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11860; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
11861; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
11862; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
11863; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
11864; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
11865; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
11866; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
11867; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
11868; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21
11869; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
11870; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
11871; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
11872; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
11873; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11874; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
11875; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
11876; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
11877; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
11878; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
11879; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23
11880; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
11881; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
11882; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
11883; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
11884; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
11885; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
11886; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
11887; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
11888; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
11889; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27
11890; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
11891; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30
11892; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
11893; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm17
11894; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
11895; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
11896; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
11897; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm7
11898; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8
11899; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
11900; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm1
11901; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm0
11902; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
11903; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
11904; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
11905; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
11906; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
11907; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
11908; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
11909; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm18
11910; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
11911; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
11912; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11
11913; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
11914; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
11915; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
11916; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
11917; AVX512-FCP-NEXT:    movb $56, %al
11918; AVX512-FCP-NEXT:    kmovw %eax, %k2
11919; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11920; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
11921; AVX512-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
11922; AVX512-FCP-NEXT:    kmovw %eax, %k1
11923; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11924; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
11925; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11926; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
11927; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11928; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
11929; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11930; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
11931; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11932; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
11933; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11934; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
11935; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11936; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
11937; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11938; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
11939; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11940; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
11941; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11942; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
11943; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11944; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
11945; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11946; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
11947; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11948; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
11949; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11950; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
11951; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11952; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
11953; AVX512-FCP-NEXT:    movw $31, %ax
11954; AVX512-FCP-NEXT:    kmovw %eax, %k2
11955; AVX512-FCP-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
11956; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11957; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
11958; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11959; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11960; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
11961; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11962; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
11963; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
11964; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11965; AVX512-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
11966; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11967; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
11968; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm24
11969; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
11970; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11971; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
11972; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
11973; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11974; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
11975; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11976; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
11977; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
11978; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11979; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
11980; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
11981; AVX512-FCP-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
11982; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11983; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
11984; AVX512-FCP-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
11985; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11986; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
11987; AVX512-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
11988; AVX512-FCP-NEXT:    kmovw %eax, %k1
11989; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11990; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
11991; AVX512-FCP-NEXT:    movb $-32, %al
11992; AVX512-FCP-NEXT:    kmovw %eax, %k2
11993; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11994; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
11995; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11996; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
11997; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11998; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
11999; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12000; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
12001; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12002; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
12003; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12004; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
12005; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12006; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
12007; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12008; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
12009; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12010; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
12011; AVX512-FCP-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
12012; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12013; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
12014; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
12015; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12016; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
12017; AVX512-FCP-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
12018; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12019; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
12020; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
12021; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
12022; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
12023; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rsi)
12024; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
12025; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
12026; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
12027; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
12028; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
12029; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%rcx)
12030; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
12031; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
12032; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 192(%r8)
12033; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%r8)
12034; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r8)
12035; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 128(%r8)
12036; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r9)
12037; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%r9)
12038; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r9)
12039; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r9)
12040; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12041; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
12042; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
12043; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
12044; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
12045; AVX512-FCP-NEXT:    addq $2632, %rsp # imm = 0xA48
12046; AVX512-FCP-NEXT:    vzeroupper
12047; AVX512-FCP-NEXT:    retq
12048;
12049; AVX512DQ-LABEL: load_i32_stride6_vf64:
12050; AVX512DQ:       # %bb.0:
12051; AVX512DQ-NEXT:    subq $2632, %rsp # imm = 0xA48
12052; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm21
12053; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm1
12054; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm20
12055; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm0
12056; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm2
12057; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm18
12058; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm27
12059; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm3
12060; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm25
12061; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm4
12062; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
12063; AVX512DQ-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
12064; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
12065; AVX512DQ-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
12066; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm7
12067; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
12068; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12069; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm7
12070; AVX512DQ-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
12071; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12072; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm7
12073; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
12074; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12075; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm7
12076; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
12077; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12078; AVX512DQ-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
12079; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12080; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
12081; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
12082; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm8
12083; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm6
12084; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
12085; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12086; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
12087; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
12088; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
12089; AVX512DQ-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
12090; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12091; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm0
12092; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
12093; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12094; AVX512DQ-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
12095; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12096; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
12097; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12098; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm7
12099; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
12100; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12101; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm7
12102; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
12103; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12104; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm7
12105; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
12106; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12107; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
12108; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12109; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
12110; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12111; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm7
12112; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
12113; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12114; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm7
12115; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
12116; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12117; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm7
12118; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
12119; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12120; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
12121; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12122; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
12123; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
12124; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
12125; AVX512DQ-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
12126; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12127; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
12128; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12129; AVX512DQ-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
12130; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12131; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm3
12132; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
12133; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12134; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
12135; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12136; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm3
12137; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
12138; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12139; AVX512DQ-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
12140; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12141; AVX512DQ-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
12142; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12143; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm1
12144; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
12145; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12146; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
12147; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12148; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
12149; AVX512DQ-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
12150; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm0
12151; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
12152; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12153; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
12154; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
12155; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm0
12156; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
12157; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12158; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
12159; AVX512DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
12160; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm0
12161; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
12162; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12163; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
12164; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12165; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
12166; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12167; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm26
12168; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm1
12169; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
12170; AVX512DQ-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
12171; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12172; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
12173; AVX512DQ-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
12174; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12175; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm2
12176; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
12177; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12178; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm2
12179; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
12180; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12181; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm2
12182; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
12183; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12184; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
12185; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm22
12186; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm1
12187; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
12188; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
12189; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12190; AVX512DQ-NEXT:    vmovdqa64 1280(%rdi), %zmm19
12191; AVX512DQ-NEXT:    vmovdqa64 1344(%rdi), %zmm2
12192; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
12193; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12194; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4
12195; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
12196; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12197; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
12198; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12199; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm28
12200; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
12201; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
12202; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm29
12203; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
12204; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
12205; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
12206; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
12207; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12208; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
12209; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12210; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
12211; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
12212; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm2
12213; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm0
12214; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
12215; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm20
12216; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
12217; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
12218; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm21
12219; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
12220; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
12221; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1
12222; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
12223; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12224; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
12225; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1
12226; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
12227; AVX512DQ-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
12228; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
12229; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm23
12230; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
12231; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
12232; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
12233; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
12234; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm6
12235; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm15
12236; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
12237; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm16
12238; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
12239; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm27
12240; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
12241; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm30
12242; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
12243; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm17
12244; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
12245; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
12246; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm10
12247; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm7
12248; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm8
12249; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
12250; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %zmm1
12251; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm0
12252; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
12253; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm9
12254; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
12255; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
12256; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm13
12257; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
12258; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
12259; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm18
12260; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
12261; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
12262; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm11
12263; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
12264; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
12265; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
12266; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
12267; AVX512DQ-NEXT:    movb $56, %al
12268; AVX512DQ-NEXT:    kmovw %eax, %k2
12269; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12270; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
12271; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
12272; AVX512DQ-NEXT:    kmovw %eax, %k1
12273; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12274; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
12275; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12276; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
12277; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12278; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
12279; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12280; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
12281; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12282; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
12283; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12284; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
12285; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12286; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
12287; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12288; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
12289; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12290; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
12291; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12292; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
12293; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12294; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
12295; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12296; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
12297; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12298; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
12299; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12300; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
12301; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12302; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
12303; AVX512DQ-NEXT:    movw $31, %ax
12304; AVX512DQ-NEXT:    kmovw %eax, %k2
12305; AVX512DQ-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
12306; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12307; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
12308; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12309; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12310; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
12311; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12312; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
12313; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm13
12314; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12315; AVX512DQ-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
12316; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12317; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
12318; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm24
12319; AVX512DQ-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
12320; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12321; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
12322; AVX512DQ-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
12323; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12324; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
12325; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12326; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
12327; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
12328; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12329; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
12330; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
12331; AVX512DQ-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
12332; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12333; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
12334; AVX512DQ-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
12335; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12336; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
12337; AVX512DQ-NEXT:    movw $992, %ax # imm = 0x3E0
12338; AVX512DQ-NEXT:    kmovw %eax, %k1
12339; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12340; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
12341; AVX512DQ-NEXT:    movb $-32, %al
12342; AVX512DQ-NEXT:    kmovw %eax, %k2
12343; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12344; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
12345; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12346; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
12347; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12348; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
12349; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12350; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
12351; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12352; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
12353; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12354; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
12355; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12356; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
12357; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12358; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
12359; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12360; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
12361; AVX512DQ-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
12362; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12363; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
12364; AVX512DQ-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
12365; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12366; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
12367; AVX512DQ-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
12368; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12369; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
12370; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
12371; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
12372; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
12373; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rsi)
12374; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
12375; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rdx)
12376; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
12377; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
12378; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
12379; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%rcx)
12380; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
12381; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
12382; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 192(%r8)
12383; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%r8)
12384; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%r8)
12385; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 128(%r8)
12386; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 192(%r9)
12387; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%r9)
12388; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 64(%r9)
12389; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 128(%r9)
12390; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12391; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rax)
12392; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
12393; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%rax)
12394; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rax)
12395; AVX512DQ-NEXT:    addq $2632, %rsp # imm = 0xA48
12396; AVX512DQ-NEXT:    vzeroupper
12397; AVX512DQ-NEXT:    retq
12398;
12399; AVX512DQ-FCP-LABEL: load_i32_stride6_vf64:
12400; AVX512DQ-FCP:       # %bb.0:
12401; AVX512DQ-FCP-NEXT:    subq $2632, %rsp # imm = 0xA48
12402; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm21
12403; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm1
12404; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm20
12405; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm0
12406; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
12407; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm18
12408; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm27
12409; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm3
12410; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm25
12411; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
12412; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
12413; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
12414; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
12415; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
12416; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm7
12417; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
12418; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12419; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
12420; AVX512DQ-FCP-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
12421; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12422; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm7
12423; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
12424; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12425; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7
12426; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
12427; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12428; AVX512DQ-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
12429; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12430; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
12431; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
12432; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8
12433; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
12434; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
12435; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12436; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
12437; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
12438; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
12439; AVX512DQ-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
12440; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12441; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
12442; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
12443; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12444; AVX512DQ-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
12445; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12446; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
12447; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12448; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
12449; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
12450; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12451; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
12452; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
12453; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12454; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
12455; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
12456; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12457; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
12458; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12459; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
12460; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12461; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
12462; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
12463; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12464; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
12465; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
12466; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12467; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
12468; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
12469; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12470; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
12471; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12472; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
12473; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
12474; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
12475; AVX512DQ-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
12476; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12477; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
12478; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12479; AVX512DQ-FCP-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
12480; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12481; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
12482; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
12483; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12484; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
12485; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12486; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
12487; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
12488; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12489; AVX512DQ-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
12490; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12491; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
12492; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12493; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
12494; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
12495; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12496; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
12497; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12498; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
12499; AVX512DQ-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
12500; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
12501; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
12502; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12503; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
12504; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
12505; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
12506; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
12507; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12508; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
12509; AVX512DQ-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
12510; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
12511; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
12512; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12513; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
12514; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12515; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
12516; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12517; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm26
12518; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm1
12519; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
12520; AVX512DQ-FCP-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
12521; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12522; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
12523; AVX512DQ-FCP-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
12524; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12525; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
12526; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
12527; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12528; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
12529; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
12530; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12531; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
12532; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
12533; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12534; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
12535; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm22
12536; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm1
12537; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
12538; AVX512DQ-FCP-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
12539; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12540; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm19
12541; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm2
12542; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
12543; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12544; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
12545; AVX512DQ-FCP-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
12546; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12547; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
12548; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12549; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm28
12550; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
12551; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
12552; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm29
12553; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
12554; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
12555; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
12556; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
12557; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12558; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
12559; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12560; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
12561; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
12562; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
12563; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
12564; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
12565; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
12566; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
12567; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
12568; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21
12569; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
12570; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
12571; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
12572; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
12573; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12574; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
12575; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
12576; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
12577; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
12578; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
12579; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23
12580; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
12581; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
12582; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
12583; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
12584; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
12585; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
12586; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
12587; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
12588; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
12589; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27
12590; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
12591; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30
12592; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
12593; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm17
12594; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
12595; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
12596; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
12597; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm7
12598; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8
12599; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
12600; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm1
12601; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm0
12602; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
12603; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
12604; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
12605; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
12606; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
12607; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
12608; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
12609; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm18
12610; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
12611; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
12612; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11
12613; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
12614; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
12615; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
12616; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
12617; AVX512DQ-FCP-NEXT:    movb $56, %al
12618; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
12619; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12620; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
12621; AVX512DQ-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
12622; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
12623; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12624; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
12625; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12626; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
12627; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12628; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
12629; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12630; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
12631; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12632; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
12633; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12634; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
12635; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12636; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
12637; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12638; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
12639; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12640; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
12641; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12642; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
12643; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12644; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
12645; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12646; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
12647; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12648; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
12649; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12650; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
12651; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12652; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
12653; AVX512DQ-FCP-NEXT:    movw $31, %ax
12654; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
12655; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
12656; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12657; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
12658; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12659; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12660; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
12661; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12662; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
12663; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
12664; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12665; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
12666; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12667; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
12668; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm24
12669; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
12670; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12671; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
12672; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
12673; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12674; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
12675; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12676; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
12677; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
12678; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12679; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
12680; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
12681; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
12682; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12683; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
12684; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
12685; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12686; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
12687; AVX512DQ-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
12688; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
12689; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12690; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
12691; AVX512DQ-FCP-NEXT:    movb $-32, %al
12692; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
12693; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12694; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
12695; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12696; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
12697; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12698; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
12699; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12700; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
12701; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12702; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
12703; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12704; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
12705; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12706; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
12707; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12708; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
12709; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12710; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
12711; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
12712; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12713; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
12714; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
12715; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12716; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
12717; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
12718; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12719; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
12720; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
12721; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
12722; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
12723; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, (%rsi)
12724; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
12725; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
12726; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
12727; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
12728; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
12729; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%rcx)
12730; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
12731; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
12732; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 192(%r8)
12733; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, (%r8)
12734; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r8)
12735; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 128(%r8)
12736; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r9)
12737; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%r9)
12738; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r9)
12739; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r9)
12740; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12741; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
12742; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
12743; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
12744; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
12745; AVX512DQ-FCP-NEXT:    addq $2632, %rsp # imm = 0xA48
12746; AVX512DQ-FCP-NEXT:    vzeroupper
12747; AVX512DQ-FCP-NEXT:    retq
12748;
12749; AVX512BW-LABEL: load_i32_stride6_vf64:
12750; AVX512BW:       # %bb.0:
12751; AVX512BW-NEXT:    subq $2632, %rsp # imm = 0xA48
12752; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm21
12753; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm1
12754; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm20
12755; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm0
12756; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
12757; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm18
12758; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm27
12759; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm3
12760; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm25
12761; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm4
12762; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
12763; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
12764; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
12765; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
12766; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm7
12767; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
12768; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12769; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
12770; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
12771; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12772; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm7
12773; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
12774; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12775; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
12776; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
12777; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12778; AVX512BW-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
12779; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12780; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
12781; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
12782; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
12783; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6
12784; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
12785; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12786; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
12787; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
12788; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
12789; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
12790; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12791; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
12792; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
12793; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12794; AVX512BW-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
12795; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12796; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
12797; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12798; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm7
12799; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
12800; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12801; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm7
12802; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
12803; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12804; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm7
12805; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
12806; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12807; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
12808; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12809; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
12810; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12811; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm7
12812; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
12813; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12814; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm7
12815; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
12816; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12817; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm7
12818; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
12819; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12820; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
12821; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12822; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
12823; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
12824; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
12825; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
12826; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12827; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
12828; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12829; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
12830; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12831; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
12832; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
12833; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12834; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
12835; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12836; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
12837; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
12838; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12839; AVX512BW-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
12840; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12841; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
12842; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12843; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm1
12844; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
12845; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12846; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
12847; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12848; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
12849; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
12850; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
12851; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
12852; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12853; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
12854; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
12855; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
12856; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
12857; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12858; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
12859; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
12860; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
12861; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
12862; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12863; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
12864; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
12865; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
12866; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12867; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm26
12868; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm1
12869; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
12870; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
12871; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12872; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
12873; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
12874; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12875; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
12876; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
12877; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12878; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
12879; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
12880; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12881; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
12882; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
12883; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12884; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
12885; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm22
12886; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm1
12887; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
12888; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
12889; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12890; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm19
12891; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm2
12892; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
12893; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12894; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
12895; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
12896; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12897; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
12898; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12899; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm28
12900; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
12901; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
12902; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm29
12903; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
12904; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
12905; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
12906; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
12907; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12908; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
12909; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12910; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
12911; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
12912; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm2
12913; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm0
12914; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
12915; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20
12916; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
12917; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
12918; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21
12919; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
12920; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
12921; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
12922; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
12923; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12924; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
12925; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
12926; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
12927; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
12928; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
12929; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm23
12930; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
12931; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
12932; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
12933; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
12934; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
12935; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15
12936; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
12937; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16
12938; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
12939; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27
12940; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
12941; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm30
12942; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
12943; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm17
12944; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
12945; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
12946; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
12947; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm7
12948; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
12949; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
12950; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
12951; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm0
12952; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
12953; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
12954; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
12955; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
12956; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
12957; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
12958; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
12959; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm18
12960; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
12961; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
12962; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
12963; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
12964; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
12965; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
12966; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
12967; AVX512BW-NEXT:    movb $56, %al
12968; AVX512BW-NEXT:    kmovd %eax, %k2
12969; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12970; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
12971; AVX512BW-NEXT:    movw $-2048, %ax # imm = 0xF800
12972; AVX512BW-NEXT:    kmovd %eax, %k1
12973; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12974; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
12975; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12976; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
12977; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12978; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
12979; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12980; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
12981; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12982; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
12983; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12984; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
12985; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12986; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
12987; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12988; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
12989; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12990; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
12991; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12992; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
12993; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12994; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
12995; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12996; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
12997; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12998; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
12999; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13000; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
13001; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13002; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
13003; AVX512BW-NEXT:    movw $31, %ax
13004; AVX512BW-NEXT:    kmovd %eax, %k2
13005; AVX512BW-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
13006; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13007; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
13008; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
13009; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13010; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
13011; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13012; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
13013; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
13014; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
13015; AVX512BW-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
13016; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13017; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
13018; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm24
13019; AVX512BW-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
13020; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13021; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
13022; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
13023; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13024; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
13025; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
13026; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
13027; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
13028; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13029; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
13030; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
13031; AVX512BW-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
13032; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13033; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
13034; AVX512BW-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
13035; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13036; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
13037; AVX512BW-NEXT:    movw $992, %ax # imm = 0x3E0
13038; AVX512BW-NEXT:    kmovd %eax, %k1
13039; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13040; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
13041; AVX512BW-NEXT:    movb $-32, %al
13042; AVX512BW-NEXT:    kmovd %eax, %k2
13043; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13044; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
13045; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13046; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
13047; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13048; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
13049; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13050; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
13051; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13052; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
13053; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13054; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
13055; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13056; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
13057; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13058; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
13059; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13060; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
13061; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
13062; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13063; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
13064; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
13065; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13066; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
13067; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
13068; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13069; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
13070; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
13071; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
13072; AVX512BW-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
13073; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rsi)
13074; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
13075; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rdx)
13076; AVX512BW-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
13077; AVX512BW-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
13078; AVX512BW-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
13079; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%rcx)
13080; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
13081; AVX512BW-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
13082; AVX512BW-NEXT:    vmovdqa64 %zmm25, 192(%r8)
13083; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%r8)
13084; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%r8)
13085; AVX512BW-NEXT:    vmovdqa64 %zmm29, 128(%r8)
13086; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%r9)
13087; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%r9)
13088; AVX512BW-NEXT:    vmovdqa64 %zmm23, 64(%r9)
13089; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%r9)
13090; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13091; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rax)
13092; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
13093; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rax)
13094; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
13095; AVX512BW-NEXT:    addq $2632, %rsp # imm = 0xA48
13096; AVX512BW-NEXT:    vzeroupper
13097; AVX512BW-NEXT:    retq
13098;
13099; AVX512BW-FCP-LABEL: load_i32_stride6_vf64:
13100; AVX512BW-FCP:       # %bb.0:
13101; AVX512BW-FCP-NEXT:    subq $2632, %rsp # imm = 0xA48
13102; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm21
13103; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm1
13104; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm20
13105; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm0
13106; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
13107; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm18
13108; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm27
13109; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm3
13110; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm25
13111; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
13112; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
13113; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
13114; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
13115; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
13116; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm7
13117; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
13118; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13119; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
13120; AVX512BW-FCP-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
13121; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13122; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm7
13123; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
13124; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13125; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7
13126; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
13127; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13128; AVX512BW-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
13129; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13130; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
13131; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
13132; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8
13133; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
13134; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
13135; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13136; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
13137; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
13138; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
13139; AVX512BW-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
13140; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13141; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
13142; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
13143; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13144; AVX512BW-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
13145; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13146; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
13147; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
13148; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
13149; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
13150; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13151; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
13152; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
13153; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13154; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
13155; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
13156; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13157; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
13158; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13159; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
13160; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
13161; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
13162; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
13163; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13164; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
13165; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
13166; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13167; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
13168; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
13169; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13170; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
13171; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13172; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
13173; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
13174; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
13175; AVX512BW-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
13176; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13177; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
13178; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
13179; AVX512BW-FCP-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
13180; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13181; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
13182; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
13183; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13184; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
13185; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13186; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
13187; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
13188; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13189; AVX512BW-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
13190; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13191; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
13192; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13193; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
13194; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
13195; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13196; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm26
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm22
; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm19
; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm2
; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
; AVX512BW-FCP-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm28
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm29
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27
; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30
; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm17
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm7
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm18
; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11
; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
; AVX512BW-FCP-NEXT:    movb $56, %al
; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512BW-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512BW-FCP-NEXT:    movw $31, %ax
; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm24
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512BW-FCP-NEXT:    movb $-32, %al
; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, (%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 192(%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 128(%r8)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r9)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, (%r9)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r9)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r9)
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
; AVX512BW-FCP-NEXT:    addq $2632, %rsp # imm = 0xA48
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf64:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    subq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-NEXT:    vmovdqa64 1472(%rdi), %zmm21
; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm20
; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm18
; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm27
; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm25
; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm6
; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm3
; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm1
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm26
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm22
; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 1280(%rdi), %zmm19
; AVX512DQ-BW-NEXT:    vmovdqa64 1344(%rdi), %zmm2
; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm28
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm29
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm21
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm23
; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm15
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm27
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm30
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm17
; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm7
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm8
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm9
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm11
; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
; AVX512DQ-BW-NEXT:    movb $56, %al
; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512DQ-BW-NEXT:    movw $-2048, %ax # imm = 0xF800
; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-BW-NEXT:    movw $31, %ax
; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm24
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512DQ-BW-NEXT:    movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-NEXT:    movb $-32, %al
; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, 192(%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, (%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 64(%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, 128(%r8)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 192(%r9)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, (%r9)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 64(%r9)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 128(%r9)
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 128(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-NEXT:    addq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf64:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    subq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm21
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm20
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm27
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm25
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm27, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm5, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm27, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm7, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm7, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm26
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm26, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm26, %zmm8, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm22
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm22, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm19
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm22, %zmm8, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm31, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm31
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm25, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm25
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm3, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm19, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm3, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm12, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm14, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm3, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm12, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm30
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm5, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm12, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm14, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm5, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm24, %zmm10
; AVX512DQ-BW-FCP-NEXT:    movb $56, %al
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512DQ-BW-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT:    movw $31, %ax
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512DQ-BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT:    movb $-32, %al
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, 192(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, 128(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 192(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 128(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r9)
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    addq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <384 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
  %strided.vec1 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
14151  %strided.vec2 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
14152  %strided.vec3 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
14153  %strided.vec4 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
14154  %strided.vec5 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
14155  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
14156  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
14157  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
14158  store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
14159  store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
14160  store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
14161  ret void
14162}