xref: /llvm-project/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll (revision 7457f51f6cf61b960e3e6e45e63378debd5c1d5c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
15
16; These patterns are produced by LoopVectorizer for interleaved loads.
17
; Test: deinterleave one 64-byte <16 x i32> load (8 interleaved streams of
; 2 elements each, i.e. a stride-8 i32 load with vf=2) into eight <2 x i32>
; results, one per output pointer. The CHECK lines below were autogenerated
; by utils/update_llc_test_checks.py (see NOTE at top of file) -- regenerate
; with that script rather than hand-editing them.
18define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
19; SSE-LABEL: load_i32_stride8_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
22; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
23; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
24; SSE-NEXT:    movdqa (%rdi), %xmm0
25; SSE-NEXT:    movdqa 16(%rdi), %xmm1
26; SSE-NEXT:    movdqa 32(%rdi), %xmm2
27; SSE-NEXT:    movdqa 48(%rdi), %xmm3
28; SSE-NEXT:    movdqa %xmm0, %xmm4
29; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
30; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
31; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
32; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
33; SSE-NEXT:    movdqa %xmm1, %xmm6
34; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
35; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
36; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
37; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
38; SSE-NEXT:    movq %xmm4, (%rsi)
39; SSE-NEXT:    movq %xmm5, (%rdx)
40; SSE-NEXT:    movq %xmm0, (%rcx)
41; SSE-NEXT:    movq %xmm2, (%r8)
42; SSE-NEXT:    movq %xmm6, (%r9)
43; SSE-NEXT:    movq %xmm7, (%r11)
44; SSE-NEXT:    movq %xmm1, (%r10)
45; SSE-NEXT:    movq %xmm3, (%rax)
46; SSE-NEXT:    retq
47;
48; AVX-LABEL: load_i32_stride8_vf2:
49; AVX:       # %bb.0:
50; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
51; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
52; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r11
53; AVX-NEXT:    vmovaps (%rdi), %ymm0
54; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
55; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
56; AVX-NEXT:    vmovdqa (%rdi), %xmm3
57; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
58; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
59; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7]
60; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
61; AVX-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
62; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
63; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
64; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
65; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
66; AVX-NEXT:    vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
67; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
68; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
69; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
70; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
71; AVX-NEXT:    vmovq %xmm4, (%rsi)
72; AVX-NEXT:    vmovq %xmm5, (%rdx)
73; AVX-NEXT:    vmovq %xmm2, (%rcx)
74; AVX-NEXT:    vpextrq $1, %xmm2, (%r8)
75; AVX-NEXT:    vmovlps %xmm3, (%r9)
76; AVX-NEXT:    vmovlps %xmm6, (%r11)
77; AVX-NEXT:    vmovlps %xmm7, (%r10)
78; AVX-NEXT:    vmovlps %xmm0, (%rax)
79; AVX-NEXT:    vzeroupper
80; AVX-NEXT:    retq
81;
82; AVX2-LABEL: load_i32_stride8_vf2:
83; AVX2:       # %bb.0:
84; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
85; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
86; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
87; AVX2-NEXT:    vmovaps 32(%rdi), %ymm0
88; AVX2-NEXT:    vmovaps (%rdi), %ymm1
89; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
90; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm3
91; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
92; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
93; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
94; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
95; AVX2-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
96; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
97; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
98; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
99; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
100; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
101; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
102; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
103; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
104; AVX2-NEXT:    vmovq %xmm4, (%rsi)
105; AVX2-NEXT:    vmovq %xmm5, (%rdx)
106; AVX2-NEXT:    vmovq %xmm2, (%rcx)
107; AVX2-NEXT:    vpextrq $1, %xmm2, (%r8)
108; AVX2-NEXT:    vmovlps %xmm3, (%r9)
109; AVX2-NEXT:    vmovlps %xmm6, (%r11)
110; AVX2-NEXT:    vmovlps %xmm1, (%r10)
111; AVX2-NEXT:    vmovlps %xmm0, (%rax)
112; AVX2-NEXT:    vzeroupper
113; AVX2-NEXT:    retq
114;
115; AVX2-FP-LABEL: load_i32_stride8_vf2:
116; AVX2-FP:       # %bb.0:
117; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
118; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
119; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
120; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm0
121; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
122; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm2
123; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm3
124; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
125; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
126; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
127; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
128; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
129; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm3
130; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
131; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
132; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
133; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
134; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm1
135; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
136; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
137; AVX2-FP-NEXT:    vmovq %xmm4, (%rsi)
138; AVX2-FP-NEXT:    vmovq %xmm5, (%rdx)
139; AVX2-FP-NEXT:    vmovq %xmm2, (%rcx)
140; AVX2-FP-NEXT:    vpextrq $1, %xmm2, (%r8)
141; AVX2-FP-NEXT:    vmovlps %xmm3, (%r9)
142; AVX2-FP-NEXT:    vmovlps %xmm6, (%r11)
143; AVX2-FP-NEXT:    vmovlps %xmm1, (%r10)
144; AVX2-FP-NEXT:    vmovlps %xmm0, (%rax)
145; AVX2-FP-NEXT:    vzeroupper
146; AVX2-FP-NEXT:    retq
147;
148; AVX2-FCP-LABEL: load_i32_stride8_vf2:
149; AVX2-FCP:       # %bb.0:
150; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
151; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
152; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
153; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm0
154; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
155; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm2
156; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
157; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
158; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
159; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
160; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
161; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
162; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm3
163; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
164; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
165; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
166; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
167; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm1
168; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
169; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
170; AVX2-FCP-NEXT:    vmovq %xmm4, (%rsi)
171; AVX2-FCP-NEXT:    vmovq %xmm5, (%rdx)
172; AVX2-FCP-NEXT:    vmovq %xmm2, (%rcx)
173; AVX2-FCP-NEXT:    vpextrq $1, %xmm2, (%r8)
174; AVX2-FCP-NEXT:    vmovlps %xmm3, (%r9)
175; AVX2-FCP-NEXT:    vmovlps %xmm6, (%r11)
176; AVX2-FCP-NEXT:    vmovlps %xmm1, (%r10)
177; AVX2-FCP-NEXT:    vmovlps %xmm0, (%rax)
178; AVX2-FCP-NEXT:    vzeroupper
179; AVX2-FCP-NEXT:    retq
180;
181; AVX512-LABEL: load_i32_stride8_vf2:
182; AVX512:       # %bb.0:
183; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
184; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
185; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
186; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
187; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
188; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
189; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
190; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
191; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
192; AVX512-NEXT:    vmovaps 32(%rdi), %ymm1
193; AVX512-NEXT:    vmovaps (%rdi), %ymm4
194; AVX512-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
195; AVX512-NEXT:    vextractf128 $1, %ymm5, %xmm5
196; AVX512-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
197; AVX512-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
198; AVX512-NEXT:    vextractf128 $1, %ymm6, %xmm6
199; AVX512-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
200; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm4
201; AVX512-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
202; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
203; AVX512-NEXT:    vmovq %xmm2, (%rsi)
204; AVX512-NEXT:    vmovq %xmm3, (%rdx)
205; AVX512-NEXT:    vmovq %xmm0, (%rcx)
206; AVX512-NEXT:    vpextrq $1, %xmm0, (%r8)
207; AVX512-NEXT:    vmovlps %xmm5, (%r9)
208; AVX512-NEXT:    vmovlps %xmm6, (%r11)
209; AVX512-NEXT:    vmovlps %xmm4, (%r10)
210; AVX512-NEXT:    vmovlps %xmm1, (%rax)
211; AVX512-NEXT:    vzeroupper
212; AVX512-NEXT:    retq
213;
214; AVX512-FCP-LABEL: load_i32_stride8_vf2:
215; AVX512-FCP:       # %bb.0:
216; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
217; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
218; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
219; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
220; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
221; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
222; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
223; AVX512-FCP-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
224; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
225; AVX512-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
226; AVX512-FCP-NEXT:    vmovaps (%rdi), %ymm4
227; AVX512-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
228; AVX512-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
229; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
230; AVX512-FCP-NEXT:    vpermps (%rdi), %zmm6, %zmm6
231; AVX512-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
232; AVX512-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
233; AVX512-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm4
234; AVX512-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
235; AVX512-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
236; AVX512-FCP-NEXT:    vmovq %xmm2, (%rsi)
237; AVX512-FCP-NEXT:    vmovq %xmm3, (%rdx)
238; AVX512-FCP-NEXT:    vmovq %xmm0, (%rcx)
239; AVX512-FCP-NEXT:    vpextrq $1, %xmm0, (%r8)
240; AVX512-FCP-NEXT:    vmovlps %xmm5, (%r9)
241; AVX512-FCP-NEXT:    vmovlps %xmm6, (%r11)
242; AVX512-FCP-NEXT:    vmovlps %xmm4, (%r10)
243; AVX512-FCP-NEXT:    vmovlps %xmm1, (%rax)
244; AVX512-FCP-NEXT:    vzeroupper
245; AVX512-FCP-NEXT:    retq
246;
247; AVX512DQ-LABEL: load_i32_stride8_vf2:
248; AVX512DQ:       # %bb.0:
249; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
250; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
251; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r11
252; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
253; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
254; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
255; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
256; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
257; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
258; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm1
259; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm4
260; AVX512DQ-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
261; AVX512DQ-NEXT:    vextractf128 $1, %ymm5, %xmm5
262; AVX512DQ-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
263; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
264; AVX512DQ-NEXT:    vextractf128 $1, %ymm6, %xmm6
265; AVX512DQ-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
266; AVX512DQ-NEXT:    vextractf128 $1, %ymm1, %xmm4
267; AVX512DQ-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
268; AVX512DQ-NEXT:    vextractf128 $1, %ymm1, %xmm1
269; AVX512DQ-NEXT:    vmovq %xmm2, (%rsi)
270; AVX512DQ-NEXT:    vmovq %xmm3, (%rdx)
271; AVX512DQ-NEXT:    vmovq %xmm0, (%rcx)
272; AVX512DQ-NEXT:    vpextrq $1, %xmm0, (%r8)
273; AVX512DQ-NEXT:    vmovlps %xmm5, (%r9)
274; AVX512DQ-NEXT:    vmovlps %xmm6, (%r11)
275; AVX512DQ-NEXT:    vmovlps %xmm4, (%r10)
276; AVX512DQ-NEXT:    vmovlps %xmm1, (%rax)
277; AVX512DQ-NEXT:    vzeroupper
278; AVX512DQ-NEXT:    retq
279;
280; AVX512DQ-FCP-LABEL: load_i32_stride8_vf2:
281; AVX512DQ-FCP:       # %bb.0:
282; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
283; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
284; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
285; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
286; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
287; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
288; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
289; AVX512DQ-FCP-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
290; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
291; AVX512DQ-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
292; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %ymm4
293; AVX512DQ-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
294; AVX512DQ-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
295; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
296; AVX512DQ-FCP-NEXT:    vpermps (%rdi), %zmm6, %zmm6
297; AVX512DQ-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
298; AVX512DQ-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
299; AVX512DQ-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm4
300; AVX512DQ-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
301; AVX512DQ-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
302; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rsi)
303; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rdx)
304; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rcx)
305; AVX512DQ-FCP-NEXT:    vpextrq $1, %xmm0, (%r8)
306; AVX512DQ-FCP-NEXT:    vmovlps %xmm5, (%r9)
307; AVX512DQ-FCP-NEXT:    vmovlps %xmm6, (%r11)
308; AVX512DQ-FCP-NEXT:    vmovlps %xmm4, (%r10)
309; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, (%rax)
310; AVX512DQ-FCP-NEXT:    vzeroupper
311; AVX512DQ-FCP-NEXT:    retq
312;
313; AVX512BW-LABEL: load_i32_stride8_vf2:
314; AVX512BW:       # %bb.0:
315; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
316; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
317; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
318; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
319; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm1
320; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
321; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
322; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
323; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
324; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm1
325; AVX512BW-NEXT:    vmovaps (%rdi), %ymm4
326; AVX512BW-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
327; AVX512BW-NEXT:    vextractf128 $1, %ymm5, %xmm5
328; AVX512BW-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
329; AVX512BW-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
330; AVX512BW-NEXT:    vextractf128 $1, %ymm6, %xmm6
331; AVX512BW-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
332; AVX512BW-NEXT:    vextractf128 $1, %ymm1, %xmm4
333; AVX512BW-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
334; AVX512BW-NEXT:    vextractf128 $1, %ymm1, %xmm1
335; AVX512BW-NEXT:    vmovq %xmm2, (%rsi)
336; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
337; AVX512BW-NEXT:    vmovq %xmm0, (%rcx)
338; AVX512BW-NEXT:    vpextrq $1, %xmm0, (%r8)
339; AVX512BW-NEXT:    vmovlps %xmm5, (%r9)
340; AVX512BW-NEXT:    vmovlps %xmm6, (%r11)
341; AVX512BW-NEXT:    vmovlps %xmm4, (%r10)
342; AVX512BW-NEXT:    vmovlps %xmm1, (%rax)
343; AVX512BW-NEXT:    vzeroupper
344; AVX512BW-NEXT:    retq
345;
346; AVX512BW-FCP-LABEL: load_i32_stride8_vf2:
347; AVX512BW-FCP:       # %bb.0:
348; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
349; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
350; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
351; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
352; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
353; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
354; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
355; AVX512BW-FCP-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
356; AVX512BW-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
357; AVX512BW-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
358; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %ymm4
359; AVX512BW-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
360; AVX512BW-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
361; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
362; AVX512BW-FCP-NEXT:    vpermps (%rdi), %zmm6, %zmm6
363; AVX512BW-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
364; AVX512BW-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
365; AVX512BW-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm4
366; AVX512BW-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
367; AVX512BW-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
368; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
369; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
370; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rcx)
371; AVX512BW-FCP-NEXT:    vpextrq $1, %xmm0, (%r8)
372; AVX512BW-FCP-NEXT:    vmovlps %xmm5, (%r9)
373; AVX512BW-FCP-NEXT:    vmovlps %xmm6, (%r11)
374; AVX512BW-FCP-NEXT:    vmovlps %xmm4, (%r10)
375; AVX512BW-FCP-NEXT:    vmovlps %xmm1, (%rax)
376; AVX512BW-FCP-NEXT:    vzeroupper
377; AVX512BW-FCP-NEXT:    retq
378;
379; AVX512DQ-BW-LABEL: load_i32_stride8_vf2:
380; AVX512DQ-BW:       # %bb.0:
381; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
382; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
383; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
384; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
385; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm1
386; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
387; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
388; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
389; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
390; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm1
391; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %ymm4
392; AVX512DQ-BW-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
393; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm5, %xmm5
394; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
395; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
396; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm6, %xmm6
397; AVX512DQ-BW-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
398; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm1, %xmm4
399; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
400; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm1, %xmm1
401; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rsi)
402; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rdx)
403; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rcx)
404; AVX512DQ-BW-NEXT:    vpextrq $1, %xmm0, (%r8)
405; AVX512DQ-BW-NEXT:    vmovlps %xmm5, (%r9)
406; AVX512DQ-BW-NEXT:    vmovlps %xmm6, (%r11)
407; AVX512DQ-BW-NEXT:    vmovlps %xmm4, (%r10)
408; AVX512DQ-BW-NEXT:    vmovlps %xmm1, (%rax)
409; AVX512DQ-BW-NEXT:    vzeroupper
410; AVX512DQ-BW-NEXT:    retq
411;
412; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf2:
413; AVX512DQ-BW-FCP:       # %bb.0:
414; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
415; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
416; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
417; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
418; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
419; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
420; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
421; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
422; AVX512DQ-BW-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
423; AVX512DQ-BW-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
424; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %ymm4
425; AVX512DQ-BW-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
426; AVX512DQ-BW-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
427; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
428; AVX512DQ-BW-FCP-NEXT:    vpermps (%rdi), %zmm6, %zmm6
429; AVX512DQ-BW-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
430; AVX512DQ-BW-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
431; AVX512DQ-BW-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm4
432; AVX512DQ-BW-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
433; AVX512DQ-BW-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
434; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
435; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
436; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rcx)
437; AVX512DQ-BW-FCP-NEXT:    vpextrq $1, %xmm0, (%r8)
438; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm5, (%r9)
439; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm6, (%r11)
440; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm4, (%r10)
441; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, (%rax)
442; AVX512DQ-BW-FCP-NEXT:    vzeroupper
443; AVX512DQ-BW-FCP-NEXT:    retq
; IR body: the shuffle pattern the LoopVectorizer emits for an interleaved
; stride-8 load (see the note at the top of the file). strided.vec<k> picks
; elements {k, k+8} of the wide load, i.e. lane k of each of the two groups.
444  %wide.vec = load <16 x i32>, ptr %in.vec, align 64
445  %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 0, i32 8>
446  %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 1, i32 9>
447  %strided.vec2 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 2, i32 10>
448  %strided.vec3 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 3, i32 11>
449  %strided.vec4 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 4, i32 12>
450  %strided.vec5 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 5, i32 13>
451  %strided.vec6 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 6, i32 14>
452  %strided.vec7 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 7, i32 15>
453  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
454  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
455  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
456  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
457  store <2 x i32> %strided.vec4, ptr %out.vec4, align 64
458  store <2 x i32> %strided.vec5, ptr %out.vec5, align 64
459  store <2 x i32> %strided.vec6, ptr %out.vec6, align 64
460  store <2 x i32> %strided.vec7, ptr %out.vec7, align 64
461  ret void
462}
463
464define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
465; SSE-LABEL: load_i32_stride8_vf4:
466; SSE:       # %bb.0:
467; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
468; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
469; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
470; SSE-NEXT:    movaps 112(%rdi), %xmm3
471; SSE-NEXT:    movaps 80(%rdi), %xmm2
472; SSE-NEXT:    movaps (%rdi), %xmm1
473; SSE-NEXT:    movaps 16(%rdi), %xmm0
474; SSE-NEXT:    movaps 32(%rdi), %xmm4
475; SSE-NEXT:    movaps 48(%rdi), %xmm5
476; SSE-NEXT:    movaps 96(%rdi), %xmm6
477; SSE-NEXT:    movaps 64(%rdi), %xmm7
478; SSE-NEXT:    movaps %xmm7, %xmm8
479; SSE-NEXT:    unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
480; SSE-NEXT:    movaps %xmm1, %xmm9
481; SSE-NEXT:    unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
482; SSE-NEXT:    movaps %xmm9, %xmm10
483; SSE-NEXT:    movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
484; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
485; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
486; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
487; SSE-NEXT:    movaps %xmm1, %xmm4
488; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
489; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
490; SSE-NEXT:    movaps %xmm2, %xmm6
491; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
492; SSE-NEXT:    movaps %xmm0, %xmm7
493; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
494; SSE-NEXT:    movaps %xmm7, %xmm8
495; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
496; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
497; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
498; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
499; SSE-NEXT:    movaps %xmm0, %xmm3
500; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
501; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
502; SSE-NEXT:    movaps %xmm10, (%rsi)
503; SSE-NEXT:    movaps %xmm9, (%rdx)
504; SSE-NEXT:    movaps %xmm4, (%rcx)
505; SSE-NEXT:    movaps %xmm1, (%r8)
506; SSE-NEXT:    movaps %xmm8, (%r9)
507; SSE-NEXT:    movaps %xmm7, (%r11)
508; SSE-NEXT:    movaps %xmm3, (%r10)
509; SSE-NEXT:    movaps %xmm0, (%rax)
510; SSE-NEXT:    retq
511;
512; AVX-LABEL: load_i32_stride8_vf4:
513; AVX:       # %bb.0:
514; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
515; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
516; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r11
517; AVX-NEXT:    vmovaps (%rdi), %ymm0
518; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
519; AVX-NEXT:    vmovaps 64(%rdi), %ymm2
520; AVX-NEXT:    vmovaps 96(%rdi), %ymm3
521; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
522; AVX-NEXT:    vmovaps (%rdi), %xmm5
523; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
524; AVX-NEXT:    vmovaps 96(%rdi), %xmm7
525; AVX-NEXT:    vmovaps 64(%rdi), %xmm8
526; AVX-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
527; AVX-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0]
528; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1]
529; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3]
530; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
531; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm7[2,2,2,2]
532; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
533; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
534; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm10[2,3]
535; AVX-NEXT:    vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
536; AVX-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
537; AVX-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
538; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
539; AVX-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
540; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
541; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,0]
542; AVX-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
543; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
544; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
545; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
546; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[2,3]
547; AVX-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
548; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
549; AVX-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
550; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm11
551; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,0]
552; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
553; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
554; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
555; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
556; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
557; AVX-NEXT:    vmovaps %xmm6, (%rsi)
558; AVX-NEXT:    vmovaps %xmm9, (%rdx)
559; AVX-NEXT:    vmovaps %xmm5, (%rcx)
560; AVX-NEXT:    vmovaps %xmm4, (%r8)
561; AVX-NEXT:    vmovaps %xmm7, (%r9)
562; AVX-NEXT:    vmovaps %xmm8, (%r11)
563; AVX-NEXT:    vmovaps %xmm10, (%r10)
564; AVX-NEXT:    vmovaps %xmm0, (%rax)
565; AVX-NEXT:    vzeroupper
566; AVX-NEXT:    retq
567;
568; AVX2-LABEL: load_i32_stride8_vf4:
569; AVX2:       # %bb.0:
570; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
571; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
572; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
573; AVX2-NEXT:    vmovaps 96(%rdi), %ymm0
574; AVX2-NEXT:    vmovaps 64(%rdi), %ymm1
575; AVX2-NEXT:    vmovaps 32(%rdi), %ymm2
576; AVX2-NEXT:    vmovaps (%rdi), %ymm3
577; AVX2-NEXT:    vmovaps 96(%rdi), %xmm4
578; AVX2-NEXT:    vbroadcastss %xmm4, %xmm5
579; AVX2-NEXT:    vmovaps (%rdi), %xmm6
580; AVX2-NEXT:    vmovaps 32(%rdi), %xmm7
581; AVX2-NEXT:    vmovaps 64(%rdi), %xmm8
582; AVX2-NEXT:    vbroadcastss %xmm8, %xmm9
583; AVX2-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
584; AVX2-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
585; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
586; AVX2-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
587; AVX2-NEXT:    vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
588; AVX2-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3]
589; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
590; AVX2-NEXT:    vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2]
591; AVX2-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
592; AVX2-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
593; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3]
594; AVX2-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
595; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
596; AVX2-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
597; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
598; AVX2-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
599; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2]
600; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3]
601; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
602; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5]
603; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
604; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm10
605; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
606; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
607; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
608; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
609; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
610; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
611; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
612; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
613; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
614; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
615; AVX2-NEXT:    vmovaps %xmm5, (%rsi)
616; AVX2-NEXT:    vmovaps %xmm9, (%rdx)
617; AVX2-NEXT:    vmovaps %xmm7, (%rcx)
618; AVX2-NEXT:    vmovaps %xmm4, (%r8)
619; AVX2-NEXT:    vmovaps %xmm6, (%r9)
620; AVX2-NEXT:    vmovaps %xmm8, (%r11)
621; AVX2-NEXT:    vmovaps %xmm1, (%r10)
622; AVX2-NEXT:    vmovaps %xmm0, (%rax)
623; AVX2-NEXT:    vzeroupper
624; AVX2-NEXT:    retq
625;
626; AVX2-FP-LABEL: load_i32_stride8_vf4:
627; AVX2-FP:       # %bb.0:
628; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
629; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
630; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
631; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm0
632; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm1
633; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm2
634; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm3
635; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm4
636; AVX2-FP-NEXT:    vbroadcastss %xmm4, %xmm5
637; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm6
638; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm7
639; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm8
640; AVX2-FP-NEXT:    vbroadcastss %xmm8, %xmm9
641; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
642; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
643; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
644; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
645; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
646; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3]
647; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
648; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2]
649; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
650; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
651; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3]
652; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
653; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
654; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
655; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
656; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
657; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2]
658; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3]
659; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
660; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5]
661; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
662; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm10
663; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
664; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
665; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm3
666; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
667; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
668; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
669; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
670; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
671; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
672; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
673; AVX2-FP-NEXT:    vmovaps %xmm5, (%rsi)
674; AVX2-FP-NEXT:    vmovaps %xmm9, (%rdx)
675; AVX2-FP-NEXT:    vmovaps %xmm7, (%rcx)
676; AVX2-FP-NEXT:    vmovaps %xmm4, (%r8)
677; AVX2-FP-NEXT:    vmovaps %xmm6, (%r9)
678; AVX2-FP-NEXT:    vmovaps %xmm8, (%r11)
679; AVX2-FP-NEXT:    vmovaps %xmm1, (%r10)
680; AVX2-FP-NEXT:    vmovaps %xmm0, (%rax)
681; AVX2-FP-NEXT:    vzeroupper
682; AVX2-FP-NEXT:    retq
683;
684; AVX2-FCP-LABEL: load_i32_stride8_vf4:
685; AVX2-FCP:       # %bb.0:
686; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
687; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
688; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
689; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm0
690; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm1
691; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm2
692; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm3
693; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm4
694; AVX2-FCP-NEXT:    vbroadcastss %xmm4, %xmm5
695; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm6
696; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm7
697; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm8
698; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm9
699; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
700; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
701; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
702; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
703; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
704; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3]
705; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
706; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2]
707; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
708; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
709; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3]
710; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
711; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
712; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
713; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
714; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
715; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2]
716; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3]
717; AVX2-FCP-NEXT:    vextractf128 $1, %ymm8, %xmm8
718; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5]
719; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
720; AVX2-FCP-NEXT:    vextractf128 $1, %ymm10, %xmm10
721; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
722; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
723; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm3
724; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
725; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
726; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
727; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
728; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
729; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
730; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
731; AVX2-FCP-NEXT:    vmovaps %xmm5, (%rsi)
732; AVX2-FCP-NEXT:    vmovaps %xmm9, (%rdx)
733; AVX2-FCP-NEXT:    vmovaps %xmm7, (%rcx)
734; AVX2-FCP-NEXT:    vmovaps %xmm4, (%r8)
735; AVX2-FCP-NEXT:    vmovaps %xmm6, (%r9)
736; AVX2-FCP-NEXT:    vmovaps %xmm8, (%r11)
737; AVX2-FCP-NEXT:    vmovaps %xmm1, (%r10)
738; AVX2-FCP-NEXT:    vmovaps %xmm0, (%rax)
739; AVX2-FCP-NEXT:    vzeroupper
740; AVX2-FCP-NEXT:    retq
741;
742; AVX512-LABEL: load_i32_stride8_vf4:
743; AVX512:       # %bb.0:
744; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
745; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
746; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
747; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
748; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
749; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm2
750; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
751; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
752; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
753; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
754; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
755; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
756; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
757; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
758; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
759; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
760; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
761; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
762; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
763; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
764; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
765; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
766; AVX512-NEXT:    vmovdqa %xmm3, (%rdx)
767; AVX512-NEXT:    vmovdqa %xmm4, (%rcx)
768; AVX512-NEXT:    vmovdqa %xmm5, (%r8)
769; AVX512-NEXT:    vmovdqa %xmm6, (%r9)
770; AVX512-NEXT:    vmovdqa %xmm7, (%r11)
771; AVX512-NEXT:    vmovdqa %xmm8, (%r10)
772; AVX512-NEXT:    vmovdqa %xmm9, (%rax)
773; AVX512-NEXT:    vzeroupper
774; AVX512-NEXT:    retq
775;
776; AVX512-FCP-LABEL: load_i32_stride8_vf4:
777; AVX512-FCP:       # %bb.0:
778; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
779; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
780; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
781; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
782; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
783; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
784; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
785; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
786; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
787; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
788; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
789; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
790; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
791; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
792; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
793; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
794; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
795; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
796; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
797; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
798; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
799; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
800; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
801; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
802; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r8)
803; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r9)
804; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%r11)
805; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%r10)
806; AVX512-FCP-NEXT:    vmovdqa %xmm9, (%rax)
807; AVX512-FCP-NEXT:    vzeroupper
808; AVX512-FCP-NEXT:    retq
809;
810; AVX512DQ-LABEL: load_i32_stride8_vf4:
811; AVX512DQ:       # %bb.0:
812; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
813; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
814; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r11
815; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
816; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm1
817; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm2
818; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
819; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
820; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
821; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
822; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
823; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
824; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
825; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
826; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
827; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
828; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
829; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
830; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
831; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
832; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
833; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsi)
834; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rdx)
835; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rcx)
836; AVX512DQ-NEXT:    vmovdqa %xmm5, (%r8)
837; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r9)
838; AVX512DQ-NEXT:    vmovdqa %xmm7, (%r11)
839; AVX512DQ-NEXT:    vmovdqa %xmm8, (%r10)
840; AVX512DQ-NEXT:    vmovdqa %xmm9, (%rax)
841; AVX512DQ-NEXT:    vzeroupper
842; AVX512DQ-NEXT:    retq
843;
844; AVX512DQ-FCP-LABEL: load_i32_stride8_vf4:
845; AVX512DQ-FCP:       # %bb.0:
846; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
847; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
848; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
849; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
850; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
851; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
852; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
853; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
854; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
855; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
856; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
857; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
858; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
859; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
860; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
861; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
862; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
863; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
864; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
865; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
866; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
867; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
868; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
869; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
870; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r8)
871; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r9)
872; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%r11)
873; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%r10)
874; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, (%rax)
875; AVX512DQ-FCP-NEXT:    vzeroupper
876; AVX512DQ-FCP-NEXT:    retq
877;
878; AVX512BW-LABEL: load_i32_stride8_vf4:
879; AVX512BW:       # %bb.0:
880; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
881; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
882; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
883; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
884; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
885; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
886; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
887; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
888; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
889; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
890; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
891; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
892; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
893; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
894; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
895; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
896; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
897; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
898; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
899; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
900; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
901; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
902; AVX512BW-NEXT:    vmovdqa %xmm3, (%rdx)
903; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
904; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
905; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
906; AVX512BW-NEXT:    vmovdqa %xmm7, (%r11)
907; AVX512BW-NEXT:    vmovdqa %xmm8, (%r10)
908; AVX512BW-NEXT:    vmovdqa %xmm9, (%rax)
909; AVX512BW-NEXT:    vzeroupper
910; AVX512BW-NEXT:    retq
911;
912; AVX512BW-FCP-LABEL: load_i32_stride8_vf4:
913; AVX512BW-FCP:       # %bb.0:
914; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
915; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
916; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
917; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
918; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
919; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
920; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
921; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
922; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
923; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
924; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
925; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
926; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
927; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
928; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
929; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
930; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
931; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
932; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
933; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
934; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
935; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
936; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
937; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
938; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
939; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
940; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%r11)
941; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
942; AVX512BW-FCP-NEXT:    vmovdqa %xmm9, (%rax)
943; AVX512BW-FCP-NEXT:    vzeroupper
944; AVX512BW-FCP-NEXT:    retq
945;
946; AVX512DQ-BW-LABEL: load_i32_stride8_vf4:
947; AVX512DQ-BW:       # %bb.0:
948; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
949; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
950; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
951; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
952; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm1
953; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
954; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
955; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
956; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
957; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
958; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
959; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
960; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
961; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
962; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
963; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
964; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
965; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
966; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
967; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
968; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
969; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rsi)
970; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rdx)
971; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
972; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
973; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
974; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%r11)
975; AVX512DQ-BW-NEXT:    vmovdqa %xmm8, (%r10)
976; AVX512DQ-BW-NEXT:    vmovdqa %xmm9, (%rax)
977; AVX512DQ-BW-NEXT:    vzeroupper
978; AVX512DQ-BW-NEXT:    retq
979;
980; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf4:
981; AVX512DQ-BW-FCP:       # %bb.0:
982; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
983; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
984; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
985; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
986; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
987; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
988; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
989; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
990; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
991; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
992; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
993; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
994; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm5
995; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
996; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
997; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
998; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
999; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
1000; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm8
1001; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
1002; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm9
1003; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
1004; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
1005; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
1006; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
1007; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
1008; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%r11)
1009; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
1010; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm9, (%rax)
1011; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1012; AVX512DQ-BW-FCP-NEXT:    retq
1013  %wide.vec = load <32 x i32>, ptr %in.vec, align 64
1014  %strided.vec0 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
1015  %strided.vec1 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
1016  %strided.vec2 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
1017  %strided.vec3 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
1018  %strided.vec4 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
1019  %strided.vec5 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
1020  %strided.vec6 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
1021  %strided.vec7 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
1022  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
1023  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
1024  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
1025  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
1026  store <4 x i32> %strided.vec4, ptr %out.vec4, align 64
1027  store <4 x i32> %strided.vec5, ptr %out.vec5, align 64
1028  store <4 x i32> %strided.vec6, ptr %out.vec6, align 64
1029  store <4 x i32> %strided.vec7, ptr %out.vec7, align 64
1030  ret void
1031}
1032
1033define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
1034; SSE-LABEL: load_i32_stride8_vf8:
1035; SSE:       # %bb.0:
1036; SSE-NEXT:    movaps 112(%rdi), %xmm15
1037; SSE-NEXT:    movaps 176(%rdi), %xmm4
1038; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1039; SSE-NEXT:    movaps 144(%rdi), %xmm9
1040; SSE-NEXT:    movaps (%rdi), %xmm10
1041; SSE-NEXT:    movaps 32(%rdi), %xmm1
1042; SSE-NEXT:    movaps 96(%rdi), %xmm13
1043; SSE-NEXT:    movaps 64(%rdi), %xmm11
1044; SSE-NEXT:    movaps 160(%rdi), %xmm2
1045; SSE-NEXT:    movaps 128(%rdi), %xmm6
1046; SSE-NEXT:    movaps 224(%rdi), %xmm12
1047; SSE-NEXT:    movaps 192(%rdi), %xmm0
1048; SSE-NEXT:    movaps %xmm0, %xmm8
1049; SSE-NEXT:    unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
1050; SSE-NEXT:    movaps %xmm6, %xmm5
1051; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1052; SSE-NEXT:    movaps %xmm5, %xmm7
1053; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
1054; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1055; SSE-NEXT:    movaps %xmm11, %xmm14
1056; SSE-NEXT:    unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1057; SSE-NEXT:    movaps %xmm10, %xmm7
1058; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
1059; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1]
1060; SSE-NEXT:    movaps %xmm7, %xmm8
1061; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0]
1062; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1063; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1]
1064; SSE-NEXT:    movaps 240(%rdi), %xmm14
1065; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1066; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
1067; SSE-NEXT:    movaps 208(%rdi), %xmm12
1068; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
1069; SSE-NEXT:    movaps %xmm6, %xmm2
1070; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1071; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1072; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
1073; SSE-NEXT:    unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
1074; SSE-NEXT:    movaps %xmm10, %xmm8
1075; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0]
1076; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
1077; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1]
1078; SSE-NEXT:    movaps %xmm12, %xmm0
1079; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
1080; SSE-NEXT:    movaps %xmm9, %xmm11
1081; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
1082; SSE-NEXT:    movaps %xmm11, %xmm13
1083; SSE-NEXT:    movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
1084; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
1085; SSE-NEXT:    movaps 80(%rdi), %xmm2
1086; SSE-NEXT:    movaps %xmm2, %xmm1
1087; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
1088; SSE-NEXT:    movaps 16(%rdi), %xmm0
1089; SSE-NEXT:    movaps 48(%rdi), %xmm3
1090; SSE-NEXT:    movaps %xmm0, %xmm14
1091; SSE-NEXT:    unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
1092; SSE-NEXT:    movaps %xmm14, %xmm4
1093; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
1094; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1]
1095; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1096; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
1097; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1098; SSE-NEXT:    # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
1099; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3]
1100; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1101; SSE-NEXT:    movaps %xmm9, %xmm1
1102; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
1103; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
1104; SSE-NEXT:    movaps %xmm0, %xmm3
1105; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1106; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1107; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1108; SSE-NEXT:    movaps %xmm2, (%rsi)
1109; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1110; SSE-NEXT:    movaps %xmm2, 16(%rsi)
1111; SSE-NEXT:    movaps %xmm7, (%rdx)
1112; SSE-NEXT:    movaps %xmm5, 16(%rdx)
1113; SSE-NEXT:    movaps %xmm8, (%rcx)
1114; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1115; SSE-NEXT:    movaps %xmm2, 16(%rcx)
1116; SSE-NEXT:    movaps %xmm10, (%r8)
1117; SSE-NEXT:    movaps %xmm6, 16(%r8)
1118; SSE-NEXT:    movaps %xmm4, (%r9)
1119; SSE-NEXT:    movaps %xmm13, 16(%r9)
1120; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1121; SSE-NEXT:    movaps %xmm14, (%rax)
1122; SSE-NEXT:    movaps %xmm11, 16(%rax)
1123; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1124; SSE-NEXT:    movaps %xmm3, (%rax)
1125; SSE-NEXT:    movaps %xmm1, 16(%rax)
1126; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1127; SSE-NEXT:    movaps %xmm9, 16(%rax)
1128; SSE-NEXT:    movaps %xmm0, (%rax)
1129; SSE-NEXT:    retq
1130;
1131; AVX-LABEL: load_i32_stride8_vf8:
1132; AVX:       # %bb.0:
1133; AVX-NEXT:    vmovaps (%rdi), %ymm0
1134; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
1135; AVX-NEXT:    vmovaps 64(%rdi), %ymm2
1136; AVX-NEXT:    vmovaps 96(%rdi), %ymm3
1137; AVX-NEXT:    vmovaps 32(%rdi), %xmm8
1138; AVX-NEXT:    vmovaps (%rdi), %xmm11
1139; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
1140; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
1141; AVX-NEXT:    vmovaps 64(%rdi), %xmm10
1142; AVX-NEXT:    vmovaps 160(%rdi), %xmm14
1143; AVX-NEXT:    vmovaps 128(%rdi), %xmm15
1144; AVX-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1145; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
1146; AVX-NEXT:    vmovaps 224(%rdi), %xmm12
1147; AVX-NEXT:    vmovaps 192(%rdi), %xmm13
1148; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1149; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
1150; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
1151; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
1152; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
1153; AVX-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
1154; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
1155; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1156; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm11[1,1,1,1]
1157; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3]
1158; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3]
1159; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
1160; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
1161; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3]
1162; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
1163; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7]
1164; AVX-NEXT:    vmovaps 160(%rdi), %ymm6
1165; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
1166; AVX-NEXT:    vmovaps 128(%rdi), %ymm7
1167; AVX-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
1168; AVX-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1169; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm11
1170; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2]
1171; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3]
1172; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
1173; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7]
1174; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2]
1175; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3]
1176; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm8[0,1],xmm14[2,3]
1177; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4,5,6,7]
1178; AVX-NEXT:    vmovaps 192(%rdi), %ymm11
1179; AVX-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
1180; AVX-NEXT:    vmovaps 224(%rdi), %ymm10
1181; AVX-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
1182; AVX-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1183; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
1184; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm15[2,3,2,3]
1185; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
1186; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
1187; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1188; AVX-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
1189; AVX-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
1190; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4]
1191; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
1192; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
1193; AVX-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
1194; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
1195; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,0]
1196; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7]
1197; AVX-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
1198; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4]
1199; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm13[2,0],ymm8[2,3],ymm13[6,4],ymm8[6,7]
1200; AVX-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
1201; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
1202; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
1203; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
1204; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm15[2,0],xmm13[2,3]
1205; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
1206; AVX-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
1207; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
1208; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,0],ymm15[4,5],ymm13[6,4]
1209; AVX-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
1210; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
1211; AVX-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
1212; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
1213; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0]
1214; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
1215; AVX-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
1216; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[3,0],ymm6[7,4],ymm7[7,4]
1217; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm10[2,3],ymm6[6,4],ymm10[6,7]
1218; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
1219; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
1220; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
1221; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
1222; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1223; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
1224; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1225; AVX-NEXT:    vmovaps %ymm1, (%rsi)
1226; AVX-NEXT:    vmovaps %ymm5, (%rdx)
1227; AVX-NEXT:    vmovaps %ymm14, (%rcx)
1228; AVX-NEXT:    vmovaps %ymm9, (%r8)
1229; AVX-NEXT:    vmovaps %ymm12, (%r9)
1230; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1231; AVX-NEXT:    vmovaps %ymm8, (%rax)
1232; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1233; AVX-NEXT:    vmovaps %ymm4, (%rax)
1234; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1235; AVX-NEXT:    vmovaps %ymm0, (%rax)
1236; AVX-NEXT:    vzeroupper
1237; AVX-NEXT:    retq
1238;
1239; AVX2-LABEL: load_i32_stride8_vf8:
1240; AVX2:       # %bb.0:
1241; AVX2-NEXT:    vmovaps 96(%rdi), %ymm0
1242; AVX2-NEXT:    vmovaps 64(%rdi), %ymm1
1243; AVX2-NEXT:    vmovaps 32(%rdi), %ymm2
1244; AVX2-NEXT:    vmovaps (%rdi), %ymm3
1245; AVX2-NEXT:    vmovaps 160(%rdi), %xmm7
1246; AVX2-NEXT:    vmovaps 128(%rdi), %xmm11
1247; AVX2-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
1248; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
1249; AVX2-NEXT:    vmovaps 224(%rdi), %xmm8
1250; AVX2-NEXT:    vbroadcastss %xmm8, %xmm5
1251; AVX2-NEXT:    vmovaps 192(%rdi), %xmm10
1252; AVX2-NEXT:    vbroadcastss %xmm10, %xmm6
1253; AVX2-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1254; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
1255; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1256; AVX2-NEXT:    vmovaps 96(%rdi), %xmm9
1257; AVX2-NEXT:    vbroadcastss %xmm9, %xmm5
1258; AVX2-NEXT:    vmovaps (%rdi), %xmm13
1259; AVX2-NEXT:    vmovaps 32(%rdi), %xmm14
1260; AVX2-NEXT:    vmovaps 64(%rdi), %xmm12
1261; AVX2-NEXT:    vbroadcastss %xmm12, %xmm6
1262; AVX2-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1263; AVX2-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
1264; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1265; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1266; AVX2-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
1267; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
1268; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
1269; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1270; AVX2-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
1271; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
1272; AVX2-NEXT:    vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
1273; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
1274; AVX2-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
1275; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
1276; AVX2-NEXT:    vmovaps 224(%rdi), %ymm6
1277; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
1278; AVX2-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1279; AVX2-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm7
1280; AVX2-NEXT:    vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2]
1281; AVX2-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
1282; AVX2-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
1283; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
1284; AVX2-NEXT:    vmovaps 192(%rdi), %ymm11
1285; AVX2-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
1286; AVX2-NEXT:    vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
1287; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3]
1288; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
1289; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
1290; AVX2-NEXT:    vmovaps 160(%rdi), %ymm13
1291; AVX2-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1292; AVX2-NEXT:    vmovaps 128(%rdi), %ymm10
1293; AVX2-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
1294; AVX2-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3]
1295; AVX2-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
1296; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
1297; AVX2-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1298; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1]
1299; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1300; AVX2-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
1301; AVX2-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
1302; AVX2-NEXT:    vextractf128 $1, %ymm12, %xmm12
1303; AVX2-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
1304; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
1305; AVX2-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
1306; AVX2-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5]
1307; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
1308; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1309; AVX2-NEXT:    vbroadcastss 148(%rdi), %ymm12
1310; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
1311; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7]
1312; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
1313; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
1314; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
1315; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
1316; AVX2-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1317; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1318; AVX2-NEXT:    vbroadcastss 248(%rdi), %ymm14
1319; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7]
1320; AVX2-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
1321; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
1322; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
1323; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
1324; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
1325; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1326; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1327; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1328; AVX2-NEXT:    vbroadcastss 220(%rdi), %ymm3
1329; AVX2-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
1330; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
1331; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
1332; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
1333; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
1334; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1335; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1336; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
1337; AVX2-NEXT:    vmovaps %ymm5, (%rdx)
1338; AVX2-NEXT:    vmovaps %ymm7, (%rcx)
1339; AVX2-NEXT:    vmovaps %ymm8, (%r8)
1340; AVX2-NEXT:    vmovaps %ymm9, (%r9)
1341; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1342; AVX2-NEXT:    vmovaps %ymm12, (%rax)
1343; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1344; AVX2-NEXT:    vmovaps %ymm1, (%rax)
1345; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1346; AVX2-NEXT:    vmovaps %ymm0, (%rax)
1347; AVX2-NEXT:    vzeroupper
1348; AVX2-NEXT:    retq
1349;
1350; AVX2-FP-LABEL: load_i32_stride8_vf8:
1351; AVX2-FP:       # %bb.0:
1352; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm0
1353; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm1
1354; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm2
1355; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm3
1356; AVX2-FP-NEXT:    vmovaps 160(%rdi), %xmm7
1357; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm11
1358; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
1359; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
1360; AVX2-FP-NEXT:    vmovaps 224(%rdi), %xmm8
1361; AVX2-FP-NEXT:    vbroadcastss %xmm8, %xmm5
1362; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm10
1363; AVX2-FP-NEXT:    vbroadcastss %xmm10, %xmm6
1364; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1365; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
1366; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1367; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm9
1368; AVX2-FP-NEXT:    vbroadcastss %xmm9, %xmm5
1369; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm13
1370; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm14
1371; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm12
1372; AVX2-FP-NEXT:    vbroadcastss %xmm12, %xmm6
1373; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1374; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
1375; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1376; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1377; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
1378; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
1379; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
1380; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1381; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
1382; AVX2-FP-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
1383; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
1384; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
1385; AVX2-FP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
1386; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
1387; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm6
1388; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
1389; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1390; AVX2-FP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm7
1391; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2]
1392; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
1393; AVX2-FP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
1394; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
1395; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm11
1396; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
1397; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
1398; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3]
1399; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
1400; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
1401; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm13
1402; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1403; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm10
1404; AVX2-FP-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
1405; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3]
1406; AVX2-FP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
1407; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
1408; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1409; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1]
1410; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1411; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
1412; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
1413; AVX2-FP-NEXT:    vextractf128 $1, %ymm12, %xmm12
1414; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
1415; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
1416; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
1417; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5]
1418; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
1419; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1420; AVX2-FP-NEXT:    vbroadcastss 148(%rdi), %ymm12
1421; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
1422; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7]
1423; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
1424; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
1425; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
1426; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
1427; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1428; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1429; AVX2-FP-NEXT:    vbroadcastss 248(%rdi), %ymm14
1430; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7]
1431; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
1432; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
1433; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm3
1434; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
1435; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
1436; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1437; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1438; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1439; AVX2-FP-NEXT:    vbroadcastss 220(%rdi), %ymm3
1440; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
1441; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
1442; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
1443; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
1444; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
1445; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1446; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1447; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
1448; AVX2-FP-NEXT:    vmovaps %ymm5, (%rdx)
1449; AVX2-FP-NEXT:    vmovaps %ymm7, (%rcx)
1450; AVX2-FP-NEXT:    vmovaps %ymm8, (%r8)
1451; AVX2-FP-NEXT:    vmovaps %ymm9, (%r9)
1452; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1453; AVX2-FP-NEXT:    vmovaps %ymm12, (%rax)
1454; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1455; AVX2-FP-NEXT:    vmovaps %ymm1, (%rax)
1456; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1457; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
1458; AVX2-FP-NEXT:    vzeroupper
1459; AVX2-FP-NEXT:    retq
1460;
1461; AVX2-FCP-LABEL: load_i32_stride8_vf8:
1462; AVX2-FCP:       # %bb.0:
1463; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm0
1464; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm1
1465; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm2
1466; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm3
1467; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %xmm7
1468; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm11
1469; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
1470; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
1471; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %xmm8
1472; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm5
1473; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm10
1474; AVX2-FCP-NEXT:    vbroadcastss %xmm10, %xmm6
1475; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1476; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
1477; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1478; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm9
1479; AVX2-FCP-NEXT:    vbroadcastss %xmm9, %xmm5
1480; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm13
1481; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm14
1482; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm12
1483; AVX2-FCP-NEXT:    vbroadcastss %xmm12, %xmm6
1484; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1485; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
1486; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1487; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1488; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
1489; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
1490; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
1491; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1492; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
1493; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
1494; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
1495; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
1496; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
1497; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
1498; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm6
1499; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
1500; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1501; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm7
1502; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2]
1503; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
1504; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
1505; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
1506; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm11
1507; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
1508; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
1509; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3]
1510; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
1511; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
1512; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
1513; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1514; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm10
1515; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
1516; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3]
1517; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
1518; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
1519; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1520; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1]
1521; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1522; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
1523; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
1524; AVX2-FCP-NEXT:    vextractf128 $1, %ymm12, %xmm12
1525; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
1526; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
1527; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
1528; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5]
1529; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
1530; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1531; AVX2-FCP-NEXT:    vbroadcastss 148(%rdi), %ymm12
1532; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
1533; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7]
1534; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
1535; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
1536; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
1537; AVX2-FCP-NEXT:    vextractf128 $1, %ymm15, %xmm15
1538; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1539; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1540; AVX2-FCP-NEXT:    vbroadcastss 248(%rdi), %ymm14
1541; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7]
1542; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
1543; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
1544; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm3
1545; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
1546; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
1547; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1548; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1549; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1550; AVX2-FCP-NEXT:    vbroadcastss 220(%rdi), %ymm3
1551; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
1552; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
1553; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
1554; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
1555; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
1556; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1557; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1558; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
1559; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
1560; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rcx)
1561; AVX2-FCP-NEXT:    vmovaps %ymm8, (%r8)
1562; AVX2-FCP-NEXT:    vmovaps %ymm9, (%r9)
1563; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1564; AVX2-FCP-NEXT:    vmovaps %ymm12, (%rax)
1565; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1566; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rax)
1567; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1568; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
1569; AVX2-FCP-NEXT:    vzeroupper
1570; AVX2-FCP-NEXT:    retq
1571;
1572; AVX512-LABEL: load_i32_stride8_vf8:
1573; AVX512:       # %bb.0:
1574; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1575; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1576; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1577; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
1578; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
1579; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm2
1580; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
1581; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1582; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1583; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1584; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1585; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1586; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1587; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1588; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1589; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1590; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1591; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1592; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1593; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1594; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1595; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1596; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1597; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1598; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1599; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1600; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1601; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1602; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1603; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1604; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1605; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1606; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1607; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1608; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1609; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1610; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1611; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1612; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1613; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1614; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1615; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1616; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1617; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1618; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1619; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1620; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1621; AVX512-NEXT:    vmovdqa %ymm4, (%rsi)
1622; AVX512-NEXT:    vmovdqa %ymm5, (%rdx)
1623; AVX512-NEXT:    vmovdqa %ymm6, (%rcx)
1624; AVX512-NEXT:    vmovdqa %ymm7, (%r8)
1625; AVX512-NEXT:    vmovdqa %ymm8, (%r9)
1626; AVX512-NEXT:    vmovdqa %ymm9, (%r11)
1627; AVX512-NEXT:    vmovdqa %ymm10, (%r10)
1628; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
1629; AVX512-NEXT:    vzeroupper
1630; AVX512-NEXT:    retq
1631;
1632; AVX512-FCP-LABEL: load_i32_stride8_vf8:
1633; AVX512-FCP:       # %bb.0:
1634; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1635; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1636; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1637; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1638; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1639; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1640; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1641; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1642; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1643; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1644; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1645; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1646; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1647; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1648; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1649; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1650; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1651; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1652; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1653; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1654; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1655; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1656; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1657; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1658; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1659; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1660; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1661; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1662; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1663; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1664; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1665; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1666; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1667; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1668; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1669; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1670; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1671; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1672; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1673; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1674; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1675; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1676; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1677; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1678; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1679; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1680; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1681; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1682; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1683; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1684; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1685; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1686; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%r11)
1687; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%r10)
1688; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1689; AVX512-FCP-NEXT:    vzeroupper
1690; AVX512-FCP-NEXT:    retq
1691;
1692; AVX512DQ-LABEL: load_i32_stride8_vf8:
1693; AVX512DQ:       # %bb.0:
1694; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1695; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1696; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1697; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
1698; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
1699; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm2
1700; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm3
1701; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1702; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1703; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1704; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1705; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1706; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1707; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1708; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1709; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1710; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1711; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1712; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1713; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1714; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1715; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1716; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1717; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1718; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1719; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1720; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1721; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1722; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1723; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1724; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1725; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1726; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1727; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1728; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1729; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1730; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1731; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1732; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1733; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1734; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1735; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1736; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1737; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1738; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1739; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1740; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1741; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rsi)
1742; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rdx)
1743; AVX512DQ-NEXT:    vmovdqa %ymm6, (%rcx)
1744; AVX512DQ-NEXT:    vmovdqa %ymm7, (%r8)
1745; AVX512DQ-NEXT:    vmovdqa %ymm8, (%r9)
1746; AVX512DQ-NEXT:    vmovdqa %ymm9, (%r11)
1747; AVX512DQ-NEXT:    vmovdqa %ymm10, (%r10)
1748; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
1749; AVX512DQ-NEXT:    vzeroupper
1750; AVX512DQ-NEXT:    retq
1751;
1752; AVX512DQ-FCP-LABEL: load_i32_stride8_vf8:
1753; AVX512DQ-FCP:       # %bb.0:
1754; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1755; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1756; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1757; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1758; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1759; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1760; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1761; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1762; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1763; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1764; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1765; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1766; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1767; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1768; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1769; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1770; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1771; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1772; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1773; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1774; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1775; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1776; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1777; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1778; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1779; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1780; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1781; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1782; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1783; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1784; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1785; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1786; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1787; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1788; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1789; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1790; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1791; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1792; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1793; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1794; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1795; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1796; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1797; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1798; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1799; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1800; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1801; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1802; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1803; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1804; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1805; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1806; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%r11)
1807; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%r10)
1808; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1809; AVX512DQ-FCP-NEXT:    vzeroupper
1810; AVX512DQ-FCP-NEXT:    retq
1811;
1812; AVX512BW-LABEL: load_i32_stride8_vf8:
1813; AVX512BW:       # %bb.0:
1814; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1815; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1816; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1817; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1818; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1819; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
1820; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
1821; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1822; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1823; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1824; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1825; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1826; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1827; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1828; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1829; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1830; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1831; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1832; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1833; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1834; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1835; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1836; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1837; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1838; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1839; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1840; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1841; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1842; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1843; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1844; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1845; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1846; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1847; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1848; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1849; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1850; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1851; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1852; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1853; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1854; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1855; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1856; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1857; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1858; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1859; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1860; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1861; AVX512BW-NEXT:    vmovdqa %ymm4, (%rsi)
1862; AVX512BW-NEXT:    vmovdqa %ymm5, (%rdx)
1863; AVX512BW-NEXT:    vmovdqa %ymm6, (%rcx)
1864; AVX512BW-NEXT:    vmovdqa %ymm7, (%r8)
1865; AVX512BW-NEXT:    vmovdqa %ymm8, (%r9)
1866; AVX512BW-NEXT:    vmovdqa %ymm9, (%r11)
1867; AVX512BW-NEXT:    vmovdqa %ymm10, (%r10)
1868; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
1869; AVX512BW-NEXT:    vzeroupper
1870; AVX512BW-NEXT:    retq
1871;
1872; AVX512BW-FCP-LABEL: load_i32_stride8_vf8:
1873; AVX512BW-FCP:       # %bb.0:
1874; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1875; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1876; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1877; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1878; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1879; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1880; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1881; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1882; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1883; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1884; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1885; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1886; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1887; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1888; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1889; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1890; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1891; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1892; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1893; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1894; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1895; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1896; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1897; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1898; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1899; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1900; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1901; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1902; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1903; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1904; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1905; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1906; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1907; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1908; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1909; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1910; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1911; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1912; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1913; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1914; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1915; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1916; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1917; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1918; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1919; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1920; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1921; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1922; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1923; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1924; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1925; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1926; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%r11)
1927; AVX512BW-FCP-NEXT:    vmovdqa %ymm10, (%r10)
1928; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1929; AVX512BW-FCP-NEXT:    vzeroupper
1930; AVX512BW-FCP-NEXT:    retq
1931;
1932; AVX512DQ-BW-LABEL: load_i32_stride8_vf8:
1933; AVX512DQ-BW:       # %bb.0:
1934; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1935; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1936; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1937; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1938; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1939; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
1940; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
1941; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1942; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1943; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1944; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1945; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1946; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1947; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1948; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1949; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1950; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1951; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1952; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1953; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1954; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
1955; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1956; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1957; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1958; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1959; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
1960; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1961; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1962; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1963; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1964; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1965; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1966; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1967; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1968; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1969; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1970; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1971; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1972; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1973; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1974; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
1975; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1976; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1977; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
1978; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1979; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1980; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1981; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rsi)
1982; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%rdx)
1983; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%rcx)
1984; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%r8)
1985; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%r9)
1986; AVX512DQ-BW-NEXT:    vmovdqa %ymm9, (%r11)
1987; AVX512DQ-BW-NEXT:    vmovdqa %ymm10, (%r10)
1988; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
1989; AVX512DQ-BW-NEXT:    vzeroupper
1990; AVX512DQ-BW-NEXT:    retq
1991;
1992; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf8:
1993; AVX512DQ-BW-FCP:       # %bb.0:
1994; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1995; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1996; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
1997; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1998; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1999; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
2000; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
2001; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
2002; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
2003; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
2004; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
2005; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
2006; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
2007; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
2008; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
2009; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
2010; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2011; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
2012; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
2013; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
2014; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
2015; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
2016; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
2017; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
2018; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
2019; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
2020; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2021; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
2022; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
2023; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
2024; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
2025; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2026; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
2027; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
2028; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
2029; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
2030; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2031; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
2032; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
2033; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
2034; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
2035; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2036; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
2037; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
2038; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
2039; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
2040; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
2041; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
2042; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
2043; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
2044; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
2045; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%r9)
2046; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r11)
2047; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm10, (%r10)
2048; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
2049; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2050; AVX512DQ-BW-FCP-NEXT:    retq
2051  %wide.vec = load <64 x i32>, ptr %in.vec, align 64
2052  %strided.vec0 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
2053  %strided.vec1 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
2054  %strided.vec2 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
2055  %strided.vec3 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
2056  %strided.vec4 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
2057  %strided.vec5 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
2058  %strided.vec6 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
2059  %strided.vec7 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
2060  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
2061  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
2062  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
2063  store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
2064  store <8 x i32> %strided.vec4, ptr %out.vec4, align 64
2065  store <8 x i32> %strided.vec5, ptr %out.vec5, align 64
2066  store <8 x i32> %strided.vec6, ptr %out.vec6, align 64
2067  store <8 x i32> %strided.vec7, ptr %out.vec7, align 64
2068  ret void
2069}
2070
2071define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
2072; SSE-LABEL: load_i32_stride8_vf16:
2073; SSE:       # %bb.0:
2074; SSE-NEXT:    subq $296, %rsp # imm = 0x128
2075; SSE-NEXT:    movaps 288(%rdi), %xmm6
2076; SSE-NEXT:    movaps 352(%rdi), %xmm0
2077; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2078; SSE-NEXT:    movaps 320(%rdi), %xmm5
2079; SSE-NEXT:    movaps 416(%rdi), %xmm2
2080; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2081; SSE-NEXT:    movaps 384(%rdi), %xmm12
2082; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2083; SSE-NEXT:    movaps 480(%rdi), %xmm13
2084; SSE-NEXT:    movaps 448(%rdi), %xmm4
2085; SSE-NEXT:    movaps 160(%rdi), %xmm7
2086; SSE-NEXT:    movaps 128(%rdi), %xmm10
2087; SSE-NEXT:    movaps 224(%rdi), %xmm8
2088; SSE-NEXT:    movaps 192(%rdi), %xmm3
2089; SSE-NEXT:    movaps %xmm3, %xmm9
2090; SSE-NEXT:    unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2091; SSE-NEXT:    movaps %xmm10, %xmm11
2092; SSE-NEXT:    movaps %xmm10, %xmm14
2093; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
2094; SSE-NEXT:    movaps %xmm11, %xmm10
2095; SSE-NEXT:    movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0]
2096; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2097; SSE-NEXT:    movaps %xmm4, %xmm10
2098; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
2099; SSE-NEXT:    unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
2100; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1]
2101; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2102; SSE-NEXT:    movaps %xmm12, %xmm9
2103; SSE-NEXT:    movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0]
2104; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2105; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1]
2106; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2107; SSE-NEXT:    movaps %xmm5, %xmm9
2108; SSE-NEXT:    unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
2109; SSE-NEXT:    movaps 256(%rdi), %xmm15
2110; SSE-NEXT:    movaps %xmm15, %xmm0
2111; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
2112; SSE-NEXT:    movaps %xmm0, %xmm10
2113; SSE-NEXT:    movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0]
2114; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2115; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
2116; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2117; SSE-NEXT:    movaps 96(%rdi), %xmm10
2118; SSE-NEXT:    movaps 64(%rdi), %xmm9
2119; SSE-NEXT:    movaps %xmm9, %xmm11
2120; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2121; SSE-NEXT:    movaps (%rdi), %xmm2
2122; SSE-NEXT:    movaps 32(%rdi), %xmm12
2123; SSE-NEXT:    movaps %xmm2, %xmm1
2124; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
2125; SSE-NEXT:    movaps %xmm1, %xmm0
2126; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
2127; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2128; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1]
2129; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
2131; SSE-NEXT:    unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
2132; SSE-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
2133; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2134; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2135; SSE-NEXT:    # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
2136; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2137; SSE-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2138; SSE-NEXT:    unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
2139; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2140; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3]
2141; SSE-NEXT:    movaps %xmm14, %xmm0
2142; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
2143; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2144; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1]
2145; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2146; SSE-NEXT:    movaps %xmm13, %xmm0
2147; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
2148; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2149; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1]
2150; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2151; SSE-NEXT:    movaps %xmm15, %xmm0
2152; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2153; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2154; SSE-NEXT:    unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1]
2155; SSE-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2156; SSE-NEXT:    movaps %xmm2, %xmm0
2157; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
2158; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2159; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1]
2160; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2161; SSE-NEXT:    movaps 240(%rdi), %xmm1
2162; SSE-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
2163; SSE-NEXT:    movaps 208(%rdi), %xmm15
2164; SSE-NEXT:    movaps %xmm15, %xmm0
2165; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2166; SSE-NEXT:    movaps 176(%rdi), %xmm2
2167; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2168; SSE-NEXT:    movaps 144(%rdi), %xmm1
2169; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2170; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2171; SSE-NEXT:    movaps %xmm1, %xmm2
2172; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2173; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2174; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2175; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2176; SSE-NEXT:    movaps 496(%rdi), %xmm1
2177; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2178; SSE-NEXT:    movaps 464(%rdi), %xmm5
2179; SSE-NEXT:    movaps %xmm5, %xmm0
2180; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2181; SSE-NEXT:    movaps 432(%rdi), %xmm1
2182; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2183; SSE-NEXT:    movaps 400(%rdi), %xmm6
2184; SSE-NEXT:    movaps %xmm6, %xmm10
2185; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
2186; SSE-NEXT:    movaps %xmm10, %xmm1
2187; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2188; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2189; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
2190; SSE-NEXT:    movaps 368(%rdi), %xmm14
2191; SSE-NEXT:    movaps 336(%rdi), %xmm2
2192; SSE-NEXT:    movaps %xmm2, %xmm0
2193; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
2194; SSE-NEXT:    movaps 304(%rdi), %xmm12
2195; SSE-NEXT:    movaps 272(%rdi), %xmm7
2196; SSE-NEXT:    movaps %xmm7, %xmm4
2197; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
2198; SSE-NEXT:    movaps %xmm4, %xmm1
2199; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2200; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2201; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
2202; SSE-NEXT:    movaps 112(%rdi), %xmm13
2203; SSE-NEXT:    movaps 80(%rdi), %xmm1
2204; SSE-NEXT:    movaps %xmm1, %xmm0
2205; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
2206; SSE-NEXT:    movaps 16(%rdi), %xmm8
2207; SSE-NEXT:    movaps 48(%rdi), %xmm11
2208; SSE-NEXT:    movaps %xmm8, %xmm3
2209; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
2210; SSE-NEXT:    movaps %xmm3, %xmm9
2211; SSE-NEXT:    movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
2212; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2213; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
2214; SSE-NEXT:    unpckhps (%rsp), %xmm15 # 16-byte Folded Reload
2215; SSE-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
2216; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2217; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2218; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
2219; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
2220; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
2221; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2222; SSE-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2223; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2224; SSE-NEXT:    # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
2225; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
2226; SSE-NEXT:    unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
2227; SSE-NEXT:    movaps %xmm0, %xmm11
2228; SSE-NEXT:    movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0]
2229; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
2230; SSE-NEXT:    movaps %xmm0, %xmm12
2231; SSE-NEXT:    movaps %xmm7, %xmm9
2232; SSE-NEXT:    movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0]
2233; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
2234; SSE-NEXT:    movaps %xmm6, %xmm0
2235; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2236; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
2237; SSE-NEXT:    movaps %xmm8, %xmm2
2238; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2239; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
2240; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2241; SSE-NEXT:    movaps %xmm1, 32(%rsi)
2242; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2243; SSE-NEXT:    movaps %xmm1, 48(%rsi)
2244; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2245; SSE-NEXT:    movaps %xmm1, (%rsi)
2246; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2247; SSE-NEXT:    movaps %xmm1, 16(%rsi)
2248; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2249; SSE-NEXT:    movaps %xmm1, 32(%rdx)
2250; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2251; SSE-NEXT:    movaps %xmm1, 48(%rdx)
2252; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2253; SSE-NEXT:    movaps %xmm1, (%rdx)
2254; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2255; SSE-NEXT:    movaps %xmm1, 16(%rdx)
2256; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2257; SSE-NEXT:    movaps %xmm1, 32(%rcx)
2258; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2259; SSE-NEXT:    movaps %xmm1, 48(%rcx)
2260; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2261; SSE-NEXT:    movaps %xmm1, (%rcx)
2262; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2263; SSE-NEXT:    movaps %xmm1, 16(%rcx)
2264; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2265; SSE-NEXT:    movaps %xmm1, 32(%r8)
2266; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2267; SSE-NEXT:    movaps %xmm1, 48(%r8)
2268; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2269; SSE-NEXT:    movaps %xmm1, (%r8)
2270; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2271; SSE-NEXT:    movaps %xmm1, 16(%r8)
2272; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2273; SSE-NEXT:    movaps %xmm1, 32(%r9)
2274; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2275; SSE-NEXT:    movaps %xmm1, 48(%r9)
2276; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2277; SSE-NEXT:    movaps %xmm1, (%r9)
2278; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2279; SSE-NEXT:    movaps %xmm1, 16(%r9)
2280; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2281; SSE-NEXT:    movaps %xmm4, 32(%rax)
2282; SSE-NEXT:    movaps %xmm10, 48(%rax)
2283; SSE-NEXT:    movaps %xmm3, (%rax)
2284; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2285; SSE-NEXT:    movaps %xmm1, 16(%rax)
2286; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2287; SSE-NEXT:    movaps %xmm0, 48(%rax)
2288; SSE-NEXT:    movaps %xmm9, 32(%rax)
2289; SSE-NEXT:    movaps %xmm11, 16(%rax)
2290; SSE-NEXT:    movaps %xmm2, (%rax)
2291; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2292; SSE-NEXT:    movaps %xmm6, 48(%rax)
2293; SSE-NEXT:    movaps %xmm7, 32(%rax)
2294; SSE-NEXT:    movaps %xmm12, 16(%rax)
2295; SSE-NEXT:    movaps %xmm8, (%rax)
2296; SSE-NEXT:    addq $296, %rsp # imm = 0x128
2297; SSE-NEXT:    retq
2298;
2299; AVX-LABEL: load_i32_stride8_vf16:
2300; AVX:       # %bb.0:
2301; AVX-NEXT:    subq $584, %rsp # imm = 0x248
2302; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
2303; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2304; AVX-NEXT:    vmovaps (%rdi), %xmm12
2305; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
2306; AVX-NEXT:    vmovaps 96(%rdi), %xmm1
2307; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2308; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
2309; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2310; AVX-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2311; AVX-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm9[0]
2312; AVX-NEXT:    vmovaps 160(%rdi), %xmm8
2313; AVX-NEXT:    vmovaps 128(%rdi), %xmm10
2314; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
2315; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
2316; AVX-NEXT:    vmovaps 224(%rdi), %xmm0
2317; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2318; AVX-NEXT:    vmovaps 192(%rdi), %xmm1
2319; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2320; AVX-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2321; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1]
2322; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2323; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
2324; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7]
2325; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2326; AVX-NEXT:    vmovaps 416(%rdi), %xmm11
2327; AVX-NEXT:    vmovaps 384(%rdi), %xmm13
2328; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
2329; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
2330; AVX-NEXT:    vmovaps 480(%rdi), %xmm0
2331; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2332; AVX-NEXT:    vmovaps 448(%rdi), %xmm1
2333; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2334; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2335; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1]
2336; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
2337; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm6[6,7]
2338; AVX-NEXT:    vmovaps 288(%rdi), %xmm14
2339; AVX-NEXT:    vmovaps 256(%rdi), %xmm15
2340; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
2341; AVX-NEXT:    vmovaps 352(%rdi), %xmm7
2342; AVX-NEXT:    vmovaps 320(%rdi), %xmm6
2343; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
2344; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2345; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2346; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2347; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1]
2348; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2349; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
2350; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3]
2351; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm2
2352; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1]
2353; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3]
2354; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2355; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
2356; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2357; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2358; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1]
2359; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
2360; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2361; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
2362; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1]
2363; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3]
2364; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2365; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2366; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2367; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2368; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
2369; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
2370; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2371; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[2,2,2,2]
2372; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2373; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
2374; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2375; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm3
2376; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2377; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2378; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2]
2379; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2380; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3]
2381; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
2382; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2383; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2384; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
2385; AVX-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
2386; AVX-NEXT:    vmovaps (%rsp), %xmm14 # 16-byte Reload
2387; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2]
2388; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2389; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3]
2390; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2391; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm8
2392; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
2393; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm7[2,2,2,2]
2394; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
2395; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm2[0,1],xmm8[2,3]
2396; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
2397; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2398; AVX-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2399; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
2400; AVX-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
2401; AVX-NEXT:    vmovaps 320(%rdi), %ymm8
2402; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2403; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3]
2404; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2405; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2406; AVX-NEXT:    vmovaps 352(%rdi), %ymm5
2407; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2408; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2409; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2410; AVX-NEXT:    vmovaps 416(%rdi), %ymm4
2411; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2412; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
2413; AVX-NEXT:    vmovaps 384(%rdi), %ymm6
2414; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2415; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1]
2416; AVX-NEXT:    vmovaps 448(%rdi), %ymm7
2417; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2418; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
2419; AVX-NEXT:    vmovaps 480(%rdi), %ymm9
2420; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2421; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3]
2422; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2423; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2424; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2425; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2426; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
2427; AVX-NEXT:    vmovaps %ymm9, %ymm3
2428; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5]
2429; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2430; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm8[0],ymm5[2],ymm8[2]
2431; AVX-NEXT:    vmovaps %ymm8, %ymm6
2432; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
2433; AVX-NEXT:    vmovaps 288(%rdi), %ymm7
2434; AVX-NEXT:    vmovaps 256(%rdi), %ymm9
2435; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
2436; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
2437; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
2438; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2439; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
2440; AVX-NEXT:    vmovaps 160(%rdi), %ymm11
2441; AVX-NEXT:    vmovaps 128(%rdi), %ymm12
2442; AVX-NEXT:    vmovaps 192(%rdi), %ymm10
2443; AVX-NEXT:    vmovaps 224(%rdi), %ymm13
2444; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
2445; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2446; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5]
2447; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2448; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2449; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2450; AVX-NEXT:    vmovaps 64(%rdi), %ymm1
2451; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2452; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
2453; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2454; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2455; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
2456; AVX-NEXT:    vmovaps (%rdi), %ymm1
2457; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2458; AVX-NEXT:    vmovaps 32(%rdi), %ymm15
2459; AVX-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
2460; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2461; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0]
2462; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
2463; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2464; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2465; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
2466; AVX-NEXT:    vmovaps %ymm3, %ymm8
2467; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2468; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2469; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[1,0],ymm3[1,0],ymm0[5,4],ymm3[5,4]
2470; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7]
2471; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2472; AVX-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
2473; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
2474; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4]
2475; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2476; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm14[2,0],xmm4[2,3]
2477; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
2478; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2479; AVX-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
2480; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm12[1,0],ymm11[5,4],ymm12[5,4]
2481; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[2,0],ymm5[2,3],ymm4[6,4],ymm5[6,7]
2482; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2483; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
2484; AVX-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5]
2485; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
2486; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
2487; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,0],ymm11[1,0],ymm15[5,4],ymm11[5,4]
2488; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2489; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm14[2,0],xmm4[2,3]
2490; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
2491; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2492; AVX-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm1[1],ymm8[3],ymm1[3]
2493; AVX-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
2494; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm5[2,0],ymm4[4,5],ymm5[6,4]
2495; AVX-NEXT:    vmovaps %ymm6, %ymm3
2496; AVX-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
2497; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
2498; AVX-NEXT:    vmovaps %ymm7, %ymm2
2499; AVX-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7]
2500; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2501; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,0]
2502; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2503; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2504; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
2505; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2506; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2507; AVX-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7]
2508; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,0],ymm4[4,5],ymm0[6,4]
2509; AVX-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
2510; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
2511; AVX-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[6],ymm15[6],ymm11[7],ymm15[7]
2512; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2513; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,0]
2514; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2515; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2516; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
2517; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2518; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
2519; AVX-NEXT:    # ymm4 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4]
2520; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm0[2,3],ymm4[6,4],ymm0[6,7]
2521; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
2522; AVX-NEXT:    # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
2523; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm2[3,0],ymm9[3,0],ymm2[7,4],ymm9[7,4]
2524; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
2525; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
2526; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3]
2527; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2528; AVX-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[6],ymm13[6],ymm7[7],ymm13[7]
2529; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm6[3,0],ymm1[7,4],ymm6[7,4]
2530; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7]
2531; AVX-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
2532; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[3,0],ymm11[3,0],ymm15[7,4],ymm11[7,4]
2533; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm2
2534; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
2535; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
2536; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
2537; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2538; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
2539; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2540; AVX-NEXT:    vmovaps %ymm2, (%rsi)
2541; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2542; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)
2543; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2544; AVX-NEXT:    vmovaps %ymm2, (%rdx)
2545; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2546; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
2547; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2548; AVX-NEXT:    vmovaps %ymm2, (%rcx)
2549; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2550; AVX-NEXT:    vmovaps %ymm2, 32(%r8)
2551; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2552; AVX-NEXT:    vmovaps %ymm2, (%r8)
2553; AVX-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
2554; AVX-NEXT:    vmovaps %ymm2, 32(%r9)
2555; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2556; AVX-NEXT:    vmovaps %ymm2, (%r9)
2557; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2558; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2559; AVX-NEXT:    vmovaps %ymm2, 32(%rax)
2560; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2561; AVX-NEXT:    vmovaps %ymm2, (%rax)
2562; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2563; AVX-NEXT:    vmovaps %ymm5, 32(%rax)
2564; AVX-NEXT:    vmovaps %ymm14, (%rax)
2565; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2566; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
2567; AVX-NEXT:    vmovaps %ymm1, (%rax)
2568; AVX-NEXT:    addq $584, %rsp # imm = 0x248
2569; AVX-NEXT:    vzeroupper
2570; AVX-NEXT:    retq
2571;
2572; AVX2-LABEL: load_i32_stride8_vf16:
2573; AVX2:       # %bb.0:
2574; AVX2-NEXT:    subq $456, %rsp # imm = 0x1C8
2575; AVX2-NEXT:    vmovaps 288(%rdi), %xmm8
2576; AVX2-NEXT:    vmovaps 256(%rdi), %xmm9
2577; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2578; AVX2-NEXT:    vmovaps 352(%rdi), %xmm1
2579; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2580; AVX2-NEXT:    vbroadcastss %xmm1, %xmm2
2581; AVX2-NEXT:    vmovaps 320(%rdi), %xmm1
2582; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2583; AVX2-NEXT:    vbroadcastss %xmm1, %xmm3
2584; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2585; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2586; AVX2-NEXT:    vmovaps 416(%rdi), %xmm1
2587; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2588; AVX2-NEXT:    vmovaps 384(%rdi), %xmm2
2589; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2590; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2591; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2592; AVX2-NEXT:    vmovaps 480(%rdi), %xmm1
2593; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2594; AVX2-NEXT:    vbroadcastss %xmm1, %xmm3
2595; AVX2-NEXT:    vmovaps 448(%rdi), %xmm1
2596; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2597; AVX2-NEXT:    vbroadcastss %xmm1, %xmm10
2598; AVX2-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
2599; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2600; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2601; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2602; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2603; AVX2-NEXT:    vmovaps 160(%rdi), %xmm6
2604; AVX2-NEXT:    vmovaps 128(%rdi), %xmm15
2605; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
2606; AVX2-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2607; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2608; AVX2-NEXT:    vmovaps 224(%rdi), %xmm4
2609; AVX2-NEXT:    vbroadcastss %xmm4, %xmm3
2610; AVX2-NEXT:    vmovaps %xmm4, (%rsp) # 16-byte Spill
2611; AVX2-NEXT:    vmovaps 192(%rdi), %xmm12
2612; AVX2-NEXT:    vbroadcastss %xmm12, %xmm11
2613; AVX2-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
2614; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2615; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2616; AVX2-NEXT:    vmovaps 96(%rdi), %xmm5
2617; AVX2-NEXT:    vbroadcastss %xmm5, %xmm2
2618; AVX2-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2619; AVX2-NEXT:    vmovaps 64(%rdi), %xmm13
2620; AVX2-NEXT:    vbroadcastss %xmm13, %xmm3
2621; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2622; AVX2-NEXT:    vmovaps (%rdi), %xmm11
2623; AVX2-NEXT:    vmovaps 32(%rdi), %xmm10
2624; AVX2-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2625; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
2626; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2627; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2628; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
2629; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
2630; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
2631; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2632; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2633; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2634; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
2635; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3]
2636; AVX2-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2637; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2638; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2639; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2640; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1]
2641; AVX2-NEXT:    vmovaps %xmm8, %xmm6
2642; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
2643; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2644; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2645; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
2646; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2647; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2648; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2649; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2650; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2651; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2652; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1]
2653; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2654; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
2655; AVX2-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2656; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2657; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2658; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2659; AVX2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
2660; AVX2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2661; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
2662; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
2663; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2664; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
2665; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2666; AVX2-NEXT:    vmovaps %xmm8, %xmm7
2667; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2]
2668; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
2669; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3]
2670; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2671; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2672; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
2673; AVX2-NEXT:    # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3]
2674; AVX2-NEXT:    vmovaps (%rsp), %xmm9 # 16-byte Reload
2675; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
2676; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3]
2677; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2678; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm6
2679; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2680; AVX2-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
2681; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2682; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
2683; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
2684; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
2685; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
2686; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2687; AVX2-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
2688; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2689; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2690; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
2691; AVX2-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
2692; AVX2-NEXT:    vmovaps 96(%rdi), %ymm5
2693; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2694; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2695; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
2696; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2697; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
2698; AVX2-NEXT:    vmovaps 64(%rdi), %ymm6
2699; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2700; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2701; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2702; AVX2-NEXT:    vmovaps 32(%rdi), %ymm15
2703; AVX2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
2704; AVX2-NEXT:    vmovaps (%rdi), %ymm14
2705; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2706; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
2707; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2708; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2709; AVX2-NEXT:    vmovaps 224(%rdi), %ymm3
2710; AVX2-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
2711; AVX2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
2712; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
2713; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2714; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2715; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
2716; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2717; AVX2-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
2718; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2]
2719; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2720; AVX2-NEXT:    vmovaps 192(%rdi), %ymm1
2721; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2722; AVX2-NEXT:    vmovaps 160(%rdi), %ymm7
2723; AVX2-NEXT:    vmovaps 128(%rdi), %ymm13
2724; AVX2-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5]
2725; AVX2-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
2726; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
2727; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2728; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2729; AVX2-NEXT:    vmovaps 288(%rdi), %ymm9
2730; AVX2-NEXT:    vmovaps 256(%rdi), %ymm8
2731; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
2732; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2733; AVX2-NEXT:    vmovaps 352(%rdi), %ymm1
2734; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2735; AVX2-NEXT:    vmovaps 320(%rdi), %ymm2
2736; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2737; AVX2-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
2738; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2]
2739; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3]
2740; AVX2-NEXT:    vmovaps 480(%rdi), %ymm0
2741; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2742; AVX2-NEXT:    vmovaps 448(%rdi), %ymm12
2743; AVX2-NEXT:    vmovaps 416(%rdi), %ymm6
2744; AVX2-NEXT:    vmovaps 384(%rdi), %ymm10
2745; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
2746; AVX2-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
2747; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2748; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2749; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2750; AVX2-NEXT:    vbroadcastss 148(%rdi), %ymm0
2751; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
2752; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2753; AVX2-NEXT:    vextractf128 $1, %ymm11, %xmm2
2754; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5]
2755; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
2756; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
2757; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
2758; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2759; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2760; AVX2-NEXT:    vbroadcastss 404(%rdi), %ymm0
2761; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
2762; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
2763; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm1
2764; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5]
2765; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
2766; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
2767; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2768; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2769; AVX2-NEXT:    vbroadcastss 248(%rdi), %ymm0
2770; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2771; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
2772; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
2773; AVX2-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7]
2774; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2775; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
2776; AVX2-NEXT:    # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
2777; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm1
2778; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2]
2779; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
2780; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2781; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2782; AVX2-NEXT:    vbroadcastss 504(%rdi), %ymm0
2783; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
2784; AVX2-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
2785; AVX2-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
2786; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2787; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
2788; AVX2-NEXT:    # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7]
2789; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm8
2790; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2]
2791; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
2792; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2793; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
2794; AVX2-NEXT:    vbroadcastss 220(%rdi), %ymm8
2795; AVX2-NEXT:    vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
2796; AVX2-NEXT:    # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
2797; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
2798; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
2799; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
2800; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
2801; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
2802; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2803; AVX2-NEXT:    vbroadcastss 476(%rdi), %ymm4
2804; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
2805; AVX2-NEXT:    # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
2806; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
2807; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm4
2808; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
2809; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
2810; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
2811; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2812; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2813; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
2814; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2815; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
2816; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2817; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
2818; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2819; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
2820; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2821; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
2822; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2823; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
2824; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2825; AVX2-NEXT:    vmovaps %ymm4, 32(%r8)
2826; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2827; AVX2-NEXT:    vmovaps %ymm4, (%r8)
2828; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2829; AVX2-NEXT:    vmovaps %ymm4, 32(%r9)
2830; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2831; AVX2-NEXT:    vmovaps %ymm4, (%r9)
2832; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2833; AVX2-NEXT:    vmovaps %ymm11, 32(%rax)
2834; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2835; AVX2-NEXT:    vmovaps %ymm4, (%rax)
2836; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2837; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
2838; AVX2-NEXT:    vmovaps %ymm1, (%rax)
2839; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2840; AVX2-NEXT:    vmovaps %ymm3, 32(%rax)
2841; AVX2-NEXT:    vmovaps %ymm2, (%rax)
2842; AVX2-NEXT:    addq $456, %rsp # imm = 0x1C8
2843; AVX2-NEXT:    vzeroupper
2844; AVX2-NEXT:    retq
2845;
2846; AVX2-FP-LABEL: load_i32_stride8_vf16:
2847; AVX2-FP:       # %bb.0:
2848; AVX2-FP-NEXT:    subq $456, %rsp # imm = 0x1C8
2849; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm8
2850; AVX2-FP-NEXT:    vmovaps 256(%rdi), %xmm9
2851; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2852; AVX2-FP-NEXT:    vmovaps 352(%rdi), %xmm1
2853; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2854; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm2
2855; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm1
2856; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2857; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm3
2858; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2859; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2860; AVX2-FP-NEXT:    vmovaps 416(%rdi), %xmm1
2861; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2862; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm2
2863; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2864; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2865; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2866; AVX2-FP-NEXT:    vmovaps 480(%rdi), %xmm1
2867; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2868; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm3
2869; AVX2-FP-NEXT:    vmovaps 448(%rdi), %xmm1
2870; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2871; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm10
2872; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
2873; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2874; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2875; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2876; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2877; AVX2-FP-NEXT:    vmovaps 160(%rdi), %xmm6
2878; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm15
2879; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
2880; AVX2-FP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2881; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
2882; AVX2-FP-NEXT:    vmovaps 224(%rdi), %xmm4
2883; AVX2-FP-NEXT:    vbroadcastss %xmm4, %xmm3
2884; AVX2-FP-NEXT:    vmovaps %xmm4, (%rsp) # 16-byte Spill
2885; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm12
2886; AVX2-FP-NEXT:    vbroadcastss %xmm12, %xmm11
2887; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
2888; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2889; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2890; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm5
2891; AVX2-FP-NEXT:    vbroadcastss %xmm5, %xmm2
2892; AVX2-FP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2893; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm13
2894; AVX2-FP-NEXT:    vbroadcastss %xmm13, %xmm3
2895; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2896; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm11
2897; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm10
2898; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2899; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
2900; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2901; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2902; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
2903; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
2904; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
2905; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2906; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2907; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2908; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
2909; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3]
2910; AVX2-FP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2911; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2912; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2913; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2914; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1]
2915; AVX2-FP-NEXT:    vmovaps %xmm8, %xmm6
2916; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
2917; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2918; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2919; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
2920; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2921; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2922; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2923; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2924; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2925; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2926; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1]
2927; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2928; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
2929; AVX2-FP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2930; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2931; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2932; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2933; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
2934; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2935; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
2936; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
2937; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2938; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
2939; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2940; AVX2-FP-NEXT:    vmovaps %xmm8, %xmm7
2941; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2]
2942; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
2943; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3]
2944; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2945; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2946; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
2947; AVX2-FP-NEXT:    # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3]
2948; AVX2-FP-NEXT:    vmovaps (%rsp), %xmm9 # 16-byte Reload
2949; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
2950; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3]
2951; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
2952; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm6
2953; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2954; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
2955; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2956; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
2957; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
2958; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
2959; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
2960; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2961; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
2962; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2963; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2964; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
2965; AVX2-FP-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
2966; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm5
2967; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2968; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2969; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
2970; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2971; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
2972; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm6
2973; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2974; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2975; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2976; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm15
2977; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
2978; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm14
2979; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2980; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
2981; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2982; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2983; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm3
2984; AVX2-FP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
2985; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
2986; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
2987; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2988; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2989; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
2990; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
2991; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
2992; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2]
2993; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2994; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm1
2995; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2996; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm7
2997; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm13
2998; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5]
2999; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
3000; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
3001; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3002; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3003; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm9
3004; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm8
3005; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
3006; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
3007; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm1
3008; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3009; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm2
3010; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3011; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
3012; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2]
3013; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3]
3014; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm0
3015; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3016; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm12
3017; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm6
3018; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm10
3019; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
3020; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
3021; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3022; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3023; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3024; AVX2-FP-NEXT:    vbroadcastss 148(%rdi), %ymm0
3025; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
3026; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
3027; AVX2-FP-NEXT:    vextractf128 $1, %ymm11, %xmm2
3028; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5]
3029; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
3030; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
3031; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
3032; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3033; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3034; AVX2-FP-NEXT:    vbroadcastss 404(%rdi), %ymm0
3035; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
3036; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
3037; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm1
3038; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5]
3039; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
3040; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
3041; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3042; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3043; AVX2-FP-NEXT:    vbroadcastss 248(%rdi), %ymm0
3044; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3045; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
3046; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
3047; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7]
3048; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3049; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
3050; AVX2-FP-NEXT:    # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
3051; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm1
3052; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2]
3053; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
3054; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
3055; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3056; AVX2-FP-NEXT:    vbroadcastss 504(%rdi), %ymm0
3057; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
3058; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
3059; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
3060; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3061; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
3062; AVX2-FP-NEXT:    # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7]
3063; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm8
3064; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2]
3065; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
3066; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3067; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
3068; AVX2-FP-NEXT:    vbroadcastss 220(%rdi), %ymm8
3069; AVX2-FP-NEXT:    vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
3070; AVX2-FP-NEXT:    # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
3071; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3072; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
3073; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
3074; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
3075; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
3076; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
3077; AVX2-FP-NEXT:    vbroadcastss 476(%rdi), %ymm4
3078; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3079; AVX2-FP-NEXT:    # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
3080; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
3081; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm4
3082; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
3083; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
3084; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3085; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3086; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3087; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
3088; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3089; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
3090; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3091; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
3092; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3093; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
3094; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3095; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
3096; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3097; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
3098; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3099; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%r8)
3100; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3101; AVX2-FP-NEXT:    vmovaps %ymm4, (%r8)
3102; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3103; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%r9)
3104; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3105; AVX2-FP-NEXT:    vmovaps %ymm4, (%r9)
3106; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3107; AVX2-FP-NEXT:    vmovaps %ymm11, 32(%rax)
3108; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3109; AVX2-FP-NEXT:    vmovaps %ymm4, (%rax)
3110; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3111; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
3112; AVX2-FP-NEXT:    vmovaps %ymm1, (%rax)
3113; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3114; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rax)
3115; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
3116; AVX2-FP-NEXT:    addq $456, %rsp # imm = 0x1C8
3117; AVX2-FP-NEXT:    vzeroupper
3118; AVX2-FP-NEXT:    retq
3119;
3120; AVX2-FCP-LABEL: load_i32_stride8_vf16:
3121; AVX2-FCP:       # %bb.0:
3122; AVX2-FCP-NEXT:    subq $456, %rsp # imm = 0x1C8
3123; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm8
3124; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %xmm9
3125; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
3126; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %xmm1
3127; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3128; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm2
3129; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %xmm1
3130; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3131; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm3
3132; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3133; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
3134; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %xmm1
3135; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3136; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm2
3137; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3138; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3139; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
3140; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %xmm1
3141; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3142; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm3
3143; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %xmm1
3144; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3145; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm10
3146; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
3147; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
3148; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
3149; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3150; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3151; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %xmm6
3152; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm15
3153; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
3154; AVX2-FCP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3155; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
3156; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %xmm4
3157; AVX2-FCP-NEXT:    vbroadcastss %xmm4, %xmm3
3158; AVX2-FCP-NEXT:    vmovaps %xmm4, (%rsp) # 16-byte Spill
3159; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm12
3160; AVX2-FCP-NEXT:    vbroadcastss %xmm12, %xmm11
3161; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
3162; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
3163; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
3164; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm5
3165; AVX2-FCP-NEXT:    vbroadcastss %xmm5, %xmm2
3166; AVX2-FCP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3167; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm13
3168; AVX2-FCP-NEXT:    vbroadcastss %xmm13, %xmm3
3169; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3170; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm11
3171; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm10
3172; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3173; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
3174; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3175; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3176; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
3177; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
3178; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
3179; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3180; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
3181; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3182; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
3183; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3]
3184; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
3185; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
3186; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3187; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3188; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1]
3189; AVX2-FCP-NEXT:    vmovaps %xmm8, %xmm6
3190; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
3191; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3192; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3193; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
3194; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3195; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3196; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3197; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3198; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3199; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3200; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1]
3201; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3202; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
3203; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
3204; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
3205; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3206; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3207; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
3208; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
3209; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
3210; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
3211; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
3212; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
3213; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
3214; AVX2-FCP-NEXT:    vmovaps %xmm8, %xmm7
3215; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2]
3216; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
3217; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3]
3218; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3219; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3220; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
3221; AVX2-FCP-NEXT:    # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3]
3222; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm9 # 16-byte Reload
3223; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
3224; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3]
3225; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
3226; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm6
3227; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
3228; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
3229; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3230; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
3231; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
3232; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
3233; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
3234; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3235; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
3236; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
3237; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3238; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
3239; AVX2-FCP-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
3240; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm5
3241; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3242; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
3243; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
3244; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3245; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
3246; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm6
3247; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3248; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3249; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3250; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm15
3251; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
3252; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm14
3253; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
3254; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
3255; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3256; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
3257; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm3
3258; AVX2-FCP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
3259; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
3260; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
3261; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3262; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3263; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
3264; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
3265; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
3266; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2]
3267; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3268; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
3269; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3270; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm7
3271; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm13
3272; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5]
3273; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
3274; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
3275; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3276; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3277; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm9
3278; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm8
3279; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
3280; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
3281; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm1
3282; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3283; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm2
3284; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3285; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
3286; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2]
3287; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3]
3288; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm0
3289; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3290; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm12
3291; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm6
3292; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm10
3293; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
3294; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
3295; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3296; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3297; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3298; AVX2-FCP-NEXT:    vbroadcastss 148(%rdi), %ymm0
3299; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
3300; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
3301; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm2
3302; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5]
3303; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
3304; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
3305; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
3306; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3307; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3308; AVX2-FCP-NEXT:    vbroadcastss 404(%rdi), %ymm0
3309; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
3310; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
3311; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm1
3312; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5]
3313; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
3314; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
3315; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3316; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3317; AVX2-FCP-NEXT:    vbroadcastss 248(%rdi), %ymm0
3318; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3319; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
3320; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
3321; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7]
3322; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3323; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
3324; AVX2-FCP-NEXT:    # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
3325; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm1
3326; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2]
3327; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
3328; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
3329; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3330; AVX2-FCP-NEXT:    vbroadcastss 504(%rdi), %ymm0
3331; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
3332; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
3333; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
3334; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3335; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
3336; AVX2-FCP-NEXT:    # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7]
3337; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm8
3338; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2]
3339; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
3340; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3341; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
3342; AVX2-FCP-NEXT:    vbroadcastss 220(%rdi), %ymm8
3343; AVX2-FCP-NEXT:    vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
3344; AVX2-FCP-NEXT:    # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
3345; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3346; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
3347; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
3348; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
3349; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
3350; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
3351; AVX2-FCP-NEXT:    vbroadcastss 476(%rdi), %ymm4
3352; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3353; AVX2-FCP-NEXT:    # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
3354; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
3355; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm4
3356; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
3357; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
3358; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3359; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3360; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3361; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
3362; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3363; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
3364; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3365; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
3366; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3367; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
3368; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3369; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
3370; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3371; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
3372; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3373; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r8)
3374; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3375; AVX2-FCP-NEXT:    vmovaps %ymm4, (%r8)
3376; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3377; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r9)
3378; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3379; AVX2-FCP-NEXT:    vmovaps %ymm4, (%r9)
3380; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3381; AVX2-FCP-NEXT:    vmovaps %ymm11, 32(%rax)
3382; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3383; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rax)
3384; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3385; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
3386; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rax)
3387; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3388; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rax)
3389; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
3390; AVX2-FCP-NEXT:    addq $456, %rsp # imm = 0x1C8
3391; AVX2-FCP-NEXT:    vzeroupper
3392; AVX2-FCP-NEXT:    retq
3393;
3394; AVX512-LABEL: load_i32_stride8_vf16:
3395; AVX512:       # %bb.0:
3396; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3397; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3398; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
3399; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
3400; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm2
3401; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm1
3402; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm4
3403; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm5
3404; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm3
3405; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm7
3406; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm6
3407; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3408; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3409; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm9
3410; AVX512-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3411; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
3412; AVX512-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
3413; AVX512-NEXT:    movb $-64, %dil
3414; AVX512-NEXT:    kmovw %edi, %k1
3415; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
3416; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm9
3417; AVX512-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
3418; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
3419; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3420; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3421; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3422; AVX512-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3423; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm10
3424; AVX512-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
3425; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm11
3426; AVX512-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
3427; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
3428; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm10
3429; AVX512-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
3430; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
3431; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3432; AVX512-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3433; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3434; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3435; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm11
3436; AVX512-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
3437; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm12
3438; AVX512-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
3439; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
3440; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm11
3441; AVX512-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
3442; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3443; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3444; AVX512-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3445; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3446; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3447; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm12
3448; AVX512-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
3449; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm13
3450; AVX512-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
3451; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
3452; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm12
3453; AVX512-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
3454; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
3455; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3456; AVX512-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3457; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3458; AVX512-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3459; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm13
3460; AVX512-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
3461; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm14
3462; AVX512-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
3463; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
3464; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm13
3465; AVX512-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
3466; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
3467; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3468; AVX512-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3469; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3470; AVX512-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3471; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm14
3472; AVX512-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
3473; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm15
3474; AVX512-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
3475; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
3476; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm14
3477; AVX512-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
3478; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
3479; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3480; AVX512-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3481; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3482; AVX512-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3483; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm15
3484; AVX512-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
3485; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm16
3486; AVX512-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
3487; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
3488; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm15
3489; AVX512-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
3490; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3491; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3492; AVX512-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3493; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3494; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3495; AVX512-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
3496; AVX512-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
3497; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
3498; AVX512-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
3499; AVX512-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
3500; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3501; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3502; AVX512-NEXT:    vmovdqa64 %zmm8, (%rsi)
3503; AVX512-NEXT:    vmovdqa64 %zmm9, (%rdx)
3504; AVX512-NEXT:    vmovdqa64 %zmm10, (%rcx)
3505; AVX512-NEXT:    vmovdqa64 %zmm11, (%r8)
3506; AVX512-NEXT:    vmovdqa64 %zmm12, (%r9)
3507; AVX512-NEXT:    vmovdqa64 %zmm13, (%r11)
3508; AVX512-NEXT:    vmovdqa64 %zmm14, (%r10)
3509; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
3510; AVX512-NEXT:    vzeroupper
3511; AVX512-NEXT:    retq
3512;
3513; AVX512-FCP-LABEL: load_i32_stride8_vf16:
3514; AVX512-FCP:       # %bb.0:
3515; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3516; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3517; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
3518; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
3519; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
3520; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
3521; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
3522; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
3523; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
3524; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
3525; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
3526; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3527; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3528; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
3529; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3530; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
3531; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
3532; AVX512-FCP-NEXT:    movb $-64, %dil
3533; AVX512-FCP-NEXT:    kmovw %edi, %k1
3534; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
3535; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
3536; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
3537; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
3538; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3539; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3540; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3541; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3542; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
3543; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
3544; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm11
3545; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
3546; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
3547; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
3548; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
3549; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
3550; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3551; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3552; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3553; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3554; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
3555; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
3556; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
3557; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
3558; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
3559; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
3560; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
3561; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3562; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3563; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3564; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3565; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3566; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
3567; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
3568; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
3569; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
3570; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
3571; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12
3572; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
3573; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
3574; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3575; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3576; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3577; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3578; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
3579; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
3580; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14
3581; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
3582; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
3583; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
3584; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
3585; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
3586; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3587; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3588; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3589; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3590; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
3591; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
3592; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15
3593; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
3594; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
3595; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm14
3596; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
3597; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
3598; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3599; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3600; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3601; AVX512-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3602; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
3603; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
3604; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16
3605; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
3606; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
3607; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
3608; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
3609; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3610; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3611; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3612; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3613; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3614; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
3615; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
3616; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
3617; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
3618; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
3619; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3620; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3621; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
3622; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%rdx)
3623; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
3624; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%r8)
3625; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%r9)
3626; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, (%r11)
3627; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%r10)
3628; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
3629; AVX512-FCP-NEXT:    vzeroupper
3630; AVX512-FCP-NEXT:    retq
3631;
3632; AVX512DQ-LABEL: load_i32_stride8_vf16:
3633; AVX512DQ:       # %bb.0:
3634; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3635; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3636; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r11
3637; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
3638; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm2
3639; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm1
3640; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm4
3641; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm5
3642; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm3
3643; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm7
3644; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm6
3645; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3646; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3647; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm9
3648; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3649; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
3650; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
3651; AVX512DQ-NEXT:    movb $-64, %dil
3652; AVX512DQ-NEXT:    kmovw %edi, %k1
3653; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
3654; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm9
3655; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
3656; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
3657; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3658; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3659; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3660; AVX512DQ-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3661; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm10
3662; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
3663; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm11
3664; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
3665; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
3666; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm10
3667; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
3668; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
3669; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3670; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3671; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3672; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3673; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm11
3674; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
3675; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm12
3676; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
3677; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
3678; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm11
3679; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
3680; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3681; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3682; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3683; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3684; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3685; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm12
3686; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
3687; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm13
3688; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
3689; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
3690; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm12
3691; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
3692; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
3693; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3694; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3695; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3696; AVX512DQ-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3697; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm13
3698; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
3699; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm14
3700; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
3701; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
3702; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm13
3703; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
3704; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
3705; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3706; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3707; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3708; AVX512DQ-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3709; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm14
3710; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
3711; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm15
3712; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
3713; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
3714; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm14
3715; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
3716; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
3717; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3718; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3719; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3720; AVX512DQ-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3721; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm15
3722; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
3723; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm16
3724; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
3725; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
3726; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm15
3727; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
3728; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3729; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3730; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3731; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3732; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3733; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
3734; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
3735; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
3736; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
3737; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
3738; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3739; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3740; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rsi)
3741; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rdx)
3742; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%rcx)
3743; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%r8)
3744; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%r9)
3745; AVX512DQ-NEXT:    vmovdqa64 %zmm13, (%r11)
3746; AVX512DQ-NEXT:    vmovdqa64 %zmm14, (%r10)
3747; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
3748; AVX512DQ-NEXT:    vzeroupper
3749; AVX512DQ-NEXT:    retq
3750;
3751; AVX512DQ-FCP-LABEL: load_i32_stride8_vf16:
3752; AVX512DQ-FCP:       # %bb.0:
3753; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3754; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3755; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
3756; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
3757; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
3758; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
3759; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
3760; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
3761; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
3762; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
3763; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
3764; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3765; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3766; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
3767; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3768; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
3769; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
3770; AVX512DQ-FCP-NEXT:    movb $-64, %dil
3771; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
3772; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
3773; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
3774; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
3775; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
3776; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3777; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3778; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3779; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3780; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
3781; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
3782; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm11
3783; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
3784; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
3785; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
3786; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
3787; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
3788; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3789; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3790; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3791; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3792; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
3793; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
3794; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
3795; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
3796; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
3797; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
3798; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
3799; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3800; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3801; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3802; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3803; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3804; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
3805; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
3806; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
3807; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
3808; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
3809; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12
3810; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
3811; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
3812; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3813; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3814; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3815; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3816; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
3817; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
3818; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14
3819; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
3820; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
3821; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
3822; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
3823; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
3824; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3825; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3826; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3827; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3828; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
3829; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
3830; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15
3831; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
3832; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
3833; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm14
3834; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
3835; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
3836; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3837; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3838; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3839; AVX512DQ-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3840; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
3841; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
3842; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16
3843; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
3844; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
3845; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
3846; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
3847; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3848; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3849; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3850; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3851; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3852; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
3853; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
3854; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
3855; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
3856; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
3857; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3858; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3859; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
3860; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%rdx)
3861; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
3862; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%r8)
3863; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, (%r9)
3864; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, (%r11)
3865; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%r10)
3866; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
3867; AVX512DQ-FCP-NEXT:    vzeroupper
3868; AVX512DQ-FCP-NEXT:    retq
3869;
3870; AVX512BW-LABEL: load_i32_stride8_vf16:
3871; AVX512BW:       # %bb.0:
3872; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3873; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3874; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
3875; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3876; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
3877; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
3878; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
3879; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm5
3880; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
3881; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
3882; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
3883; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3884; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3885; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
3886; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3887; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
3888; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
3889; AVX512BW-NEXT:    movb $-64, %dil
3890; AVX512BW-NEXT:    kmovd %edi, %k1
3891; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
3892; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
3893; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
3894; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
3895; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3896; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3897; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3898; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3899; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
3900; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
3901; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm11
3902; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
3903; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
3904; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
3905; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
3906; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
3907; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3908; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3909; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3910; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3911; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
3912; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
3913; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
3914; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
3915; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
3916; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm11
3917; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
3918; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3919; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3920; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3921; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3922; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3923; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
3924; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
3925; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
3926; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
3927; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
3928; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12
3929; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
3930; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
3931; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3932; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3933; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3934; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3935; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
3936; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
3937; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm14
3938; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
3939; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
3940; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
3941; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
3942; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
3943; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3944; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3945; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3946; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3947; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
3948; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
3949; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm15
3950; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
3951; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
3952; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
3953; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
3954; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
3955; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3956; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3957; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3958; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3959; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
3960; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
3961; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm16
3962; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
3963; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
3964; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
3965; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
3966; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3967; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3968; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3969; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3970; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3971; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
3972; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
3973; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
3974; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
3975; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
3976; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3977; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3978; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
3979; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
3980; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
3981; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%r8)
3982; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%r9)
3983; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%r11)
3984; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%r10)
3985; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
3986; AVX512BW-NEXT:    vzeroupper
3987; AVX512BW-NEXT:    retq
3988;
3989; AVX512BW-FCP-LABEL: load_i32_stride8_vf16:
3990; AVX512BW-FCP:       # %bb.0:
3991; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3992; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3993; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
3994; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
3995; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
3996; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
3997; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
3998; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
3999; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
4000; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
4001; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
4002; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
4003; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4004; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
4005; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
4006; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
4007; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
4008; AVX512BW-FCP-NEXT:    movb $-64, %dil
4009; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
4010; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
4011; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
4012; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
4013; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
4014; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4015; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
4016; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
4017; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4018; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
4019; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
4020; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm11
4021; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
4022; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
4023; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
4024; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
4025; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
4026; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
4027; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
4028; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
4029; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4030; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
4031; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
4032; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
4033; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
4034; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
4035; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
4036; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
4037; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
4038; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4039; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
4040; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
4041; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4042; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
4043; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
4044; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
4045; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
4046; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
4047; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12
4048; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
4049; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
4050; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
4051; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
4052; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
4053; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4054; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
4055; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
4056; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14
4057; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
4058; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
4059; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
4060; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
4061; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
4062; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
4063; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
4064; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
4065; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4066; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
4067; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
4068; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15
4069; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
4070; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
4071; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm14
4072; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
4073; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
4074; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
4075; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
4076; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
4077; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4078; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
4079; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
4080; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16
4081; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
4082; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
4083; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
4084; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
4085; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
4086; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
4087; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4088; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4089; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4090; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
4091; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
4092; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
4093; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
4094; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
4095; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4096; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4097; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
4098; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdx)
4099; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
4100; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r8)
4101; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%r9)
4102; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r11)
4103; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%r10)
4104; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
4105; AVX512BW-FCP-NEXT:    vzeroupper
4106; AVX512BW-FCP-NEXT:    retq
4107;
4108; AVX512DQ-BW-LABEL: load_i32_stride8_vf16:
4109; AVX512DQ-BW:       # %bb.0:
4110; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4111; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4112; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
4113; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
4114; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
4115; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
4116; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
4117; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm5
4118; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
4119; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
4120; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
4121; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
4122; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4123; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm9
4124; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
4125; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
4126; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
4127; AVX512DQ-BW-NEXT:    movb $-64, %dil
4128; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
4129; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
4130; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm9
4131; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
4132; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
4133; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4134; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
4135; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
4136; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4137; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm10
4138; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
4139; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm11
4140; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
4141; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
4142; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm10
4143; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
4144; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
4145; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
4146; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
4147; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
4148; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4149; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm11
4150; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
4151; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm12
4152; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
4153; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
4154; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm11
4155; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
4156; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
4157; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4158; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
4159; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
4160; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4161; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm12
4162; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
4163; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm13
4164; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
4165; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
4166; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm12
4167; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
4168; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
4169; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
4170; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
4171; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
4172; AVX512DQ-BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4173; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm13
4174; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
4175; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm14
4176; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
4177; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
4178; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm13
4179; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
4180; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
4181; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
4182; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
4183; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
4184; AVX512DQ-BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4185; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm14
4186; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
4187; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm15
4188; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
4189; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
4190; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm14
4191; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
4192; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
4193; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
4194; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
4195; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
4196; AVX512DQ-BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4197; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm15
4198; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
4199; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm16
4200; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
4201; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
4202; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm15
4203; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
4204; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
4205; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
4206; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4207; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4208; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4209; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
4210; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
4211; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
4212; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
4213; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
4214; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4215; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4216; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
4217; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
4218; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
4219; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, (%r8)
4220; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, (%r9)
4221; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, (%r11)
4222; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%r10)
4223; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
4224; AVX512DQ-BW-NEXT:    vzeroupper
4225; AVX512DQ-BW-NEXT:    retq
4226;
4227; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf16:
4228; AVX512DQ-BW-FCP:       # %bb.0:
4229; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4230; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4231; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
4232; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
4233; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
4234; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
4235; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
4236; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
4237; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
4238; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
4239; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
4240; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
4241; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4242; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
4243; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
4244; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
4245; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
4246; AVX512DQ-BW-FCP-NEXT:    movb $-64, %dil
4247; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
4248; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
4249; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
4250; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
4251; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
4252; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4253; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
4254; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
4255; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4256; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
4257; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
4258; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm11
4259; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
4260; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
4261; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
4262; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
4263; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
4264; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
4265; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
4266; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
4267; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4268; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
4269; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
4270; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
4271; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
4272; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
4273; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
4274; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
4275; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
4276; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4277; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
4278; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
4279; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4280; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
4281; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
4282; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
4283; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
4284; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
4285; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12
4286; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
4287; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
4288; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
4289; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
4290; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
4291; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4292; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
4293; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
4294; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14
4295; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
4296; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
4297; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
4298; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
4299; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
4300; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
4301; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
4302; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
4303; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4304; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
4305; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
4306; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15
4307; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
4308; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
4309; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm14
4310; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
4311; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
4312; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
4313; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
4314; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
4315; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4316; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
4317; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
4318; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16
4319; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
4320; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
4321; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
4322; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
4323; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
4324; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
4325; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4326; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4327; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4328; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
4329; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
4330; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
4331; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
4332; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
4333; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4334; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4335; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
4336; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdx)
4337; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
4338; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r8)
4339; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%r9)
4340; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r11)
4341; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%r10)
4342; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
4343; AVX512DQ-BW-FCP-NEXT:    vzeroupper
4344; AVX512DQ-BW-FCP-NEXT:    retq
4345  %wide.vec = load <128 x i32>, ptr %in.vec, align 64
4346  %strided.vec0 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
4347  %strided.vec1 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
4348  %strided.vec2 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122>
4349  %strided.vec3 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123>
4350  %strided.vec4 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124>
4351  %strided.vec5 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
4352  %strided.vec6 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
4353  %strided.vec7 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
4354  store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
4355  store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
4356  store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
4357  store <16 x i32> %strided.vec3, ptr %out.vec3, align 64
4358  store <16 x i32> %strided.vec4, ptr %out.vec4, align 64
4359  store <16 x i32> %strided.vec5, ptr %out.vec5, align 64
4360  store <16 x i32> %strided.vec6, ptr %out.vec6, align 64
4361  store <16 x i32> %strided.vec7, ptr %out.vec7, align 64
4362  ret void
4363}
4364
4365define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
4366; SSE-LABEL: load_i32_stride8_vf32:
4367; SSE:       # %bb.0:
4368; SSE-NEXT:    subq $952, %rsp # imm = 0x3B8
4369; SSE-NEXT:    movaps 544(%rdi), %xmm5
4370; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4371; SSE-NEXT:    movaps 608(%rdi), %xmm6
4372; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4373; SSE-NEXT:    movaps 576(%rdi), %xmm7
4374; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4375; SSE-NEXT:    movaps 672(%rdi), %xmm8
4376; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4377; SSE-NEXT:    movaps 640(%rdi), %xmm4
4378; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4379; SSE-NEXT:    movaps 736(%rdi), %xmm9
4380; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4381; SSE-NEXT:    movaps 704(%rdi), %xmm3
4382; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4383; SSE-NEXT:    movaps 160(%rdi), %xmm10
4384; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4385; SSE-NEXT:    movaps 128(%rdi), %xmm1
4386; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4387; SSE-NEXT:    movaps 224(%rdi), %xmm2
4388; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4389; SSE-NEXT:    movaps 192(%rdi), %xmm0
4390; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4391; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4392; SSE-NEXT:    movaps %xmm1, %xmm2
4393; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
4394; SSE-NEXT:    movaps %xmm2, %xmm1
4395; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4396; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4397; SSE-NEXT:    movaps %xmm3, %xmm1
4398; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
4399; SSE-NEXT:    movaps %xmm4, %xmm3
4400; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
4401; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4402; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4403; SSE-NEXT:    movaps %xmm3, %xmm0
4404; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4405; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4406; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
4407; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4408; SSE-NEXT:    movaps %xmm7, %xmm0
4409; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
4410; SSE-NEXT:    movaps 512(%rdi), %xmm1
4411; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4412; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
4413; SSE-NEXT:    movaps %xmm1, %xmm2
4414; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4415; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4416; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4417; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4418; SSE-NEXT:    movaps 480(%rdi), %xmm1
4419; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4420; SSE-NEXT:    movaps 448(%rdi), %xmm10
4421; SSE-NEXT:    movaps %xmm10, %xmm0
4422; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4423; SSE-NEXT:    movaps 416(%rdi), %xmm3
4424; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4425; SSE-NEXT:    movaps 384(%rdi), %xmm2
4426; SSE-NEXT:    movaps %xmm2, %xmm1
4427; SSE-NEXT:    movaps %xmm2, %xmm14
4428; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4429; SSE-NEXT:    movaps %xmm1, %xmm2
4430; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4431; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4432; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4433; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4434; SSE-NEXT:    movaps 992(%rdi), %xmm1
4435; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4436; SSE-NEXT:    movaps 960(%rdi), %xmm15
4437; SSE-NEXT:    movaps %xmm15, %xmm0
4438; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4439; SSE-NEXT:    movaps 928(%rdi), %xmm2
4440; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4441; SSE-NEXT:    movaps 896(%rdi), %xmm1
4442; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4443; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4444; SSE-NEXT:    movaps %xmm1, %xmm2
4445; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4446; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4447; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4448; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4449; SSE-NEXT:    movaps 352(%rdi), %xmm1
4450; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4451; SSE-NEXT:    movaps 320(%rdi), %xmm12
4452; SSE-NEXT:    movaps %xmm12, %xmm0
4453; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4454; SSE-NEXT:    movaps 288(%rdi), %xmm3
4455; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4456; SSE-NEXT:    movaps 256(%rdi), %xmm1
4457; SSE-NEXT:    movaps %xmm1, %xmm2
4458; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4459; SSE-NEXT:    movaps %xmm2, %xmm3
4460; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
4461; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4462; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4463; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4464; SSE-NEXT:    movaps 864(%rdi), %xmm2
4465; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4466; SSE-NEXT:    movaps 832(%rdi), %xmm11
4467; SSE-NEXT:    movaps %xmm11, %xmm0
4468; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4469; SSE-NEXT:    movaps 800(%rdi), %xmm4
4470; SSE-NEXT:    movaps %xmm4, (%rsp) # 16-byte Spill
4471; SSE-NEXT:    movaps 768(%rdi), %xmm2
4472; SSE-NEXT:    movaps %xmm2, %xmm3
4473; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
4474; SSE-NEXT:    movaps %xmm3, %xmm4
4475; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
4476; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4477; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
4478; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4479; SSE-NEXT:    movaps 96(%rdi), %xmm6
4480; SSE-NEXT:    movaps 64(%rdi), %xmm9
4481; SSE-NEXT:    movaps %xmm9, %xmm13
4482; SSE-NEXT:    unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1]
4483; SSE-NEXT:    movaps (%rdi), %xmm8
4484; SSE-NEXT:    movaps 32(%rdi), %xmm3
4485; SSE-NEXT:    movaps %xmm8, %xmm7
4486; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
4487; SSE-NEXT:    movaps %xmm7, %xmm5
4488; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0]
4489; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4490; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1]
4491; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4492; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4493; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4494; SSE-NEXT:    # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4495; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4496; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4497; SSE-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4498; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4499; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4500; SSE-NEXT:    # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
4501; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4502; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4503; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
4504; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4505; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4506; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4507; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
4508; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4509; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4510; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
4511; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4512; SSE-NEXT:    # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
4513; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4514; SSE-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4515; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4516; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4517; SSE-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
4518; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4519; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4520; SSE-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4521; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4522; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
4523; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4524; SSE-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4525; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4526; SSE-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
4527; SSE-NEXT:    unpckhps (%rsp), %xmm2 # 16-byte Folded Reload
4528; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4529; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
4530; SSE-NEXT:    unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3]
4531; SSE-NEXT:    movaps %xmm5, %xmm3
4532; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
4533; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4534; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1]
4535; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4536; SSE-NEXT:    movaps %xmm1, %xmm3
4537; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0]
4538; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4539; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
4540; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4541; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4542; SSE-NEXT:    movaps %xmm6, %xmm1
4543; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
4544; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4545; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1]
4546; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4547; SSE-NEXT:    movaps %xmm0, %xmm3
4548; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
4549; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4550; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
4551; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4552; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4553; SSE-NEXT:    movaps %xmm1, %xmm3
4554; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0]
4555; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4556; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
4557; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4558; SSE-NEXT:    movaps %xmm2, %xmm1
4559; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0]
4560; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4561; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
4562; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4563; SSE-NEXT:    movaps %xmm14, %xmm1
4564; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
4565; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4566; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
4567; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4568; SSE-NEXT:    movaps %xmm8, %xmm1
4569; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0]
4570; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4571; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
4572; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4573; SSE-NEXT:    movaps 240(%rdi), %xmm2
4574; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4575; SSE-NEXT:    movaps 208(%rdi), %xmm12
4576; SSE-NEXT:    movaps %xmm12, %xmm0
4577; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4578; SSE-NEXT:    movaps 176(%rdi), %xmm13
4579; SSE-NEXT:    movaps 144(%rdi), %xmm2
4580; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4581; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
4582; SSE-NEXT:    movaps %xmm2, %xmm1
4583; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4584; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4585; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4586; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4587; SSE-NEXT:    movaps 368(%rdi), %xmm4
4588; SSE-NEXT:    movaps 336(%rdi), %xmm1
4589; SSE-NEXT:    movaps %xmm1, %xmm0
4590; SSE-NEXT:    movaps %xmm1, %xmm9
4591; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4592; SSE-NEXT:    movaps 304(%rdi), %xmm5
4593; SSE-NEXT:    movaps 272(%rdi), %xmm8
4594; SSE-NEXT:    movaps %xmm8, %xmm1
4595; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
4596; SSE-NEXT:    movaps %xmm1, %xmm2
4597; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4598; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4599; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4600; SSE-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
4601; SSE-NEXT:    movaps 496(%rdi), %xmm7
4602; SSE-NEXT:    movaps 464(%rdi), %xmm0
4603; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4604; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
4605; SSE-NEXT:    movaps 432(%rdi), %xmm2
4606; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4607; SSE-NEXT:    movaps 400(%rdi), %xmm1
4608; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4609; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4610; SSE-NEXT:    movaps %xmm1, %xmm2
4611; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4612; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4613; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4614; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4615; SSE-NEXT:    movaps 624(%rdi), %xmm2
4616; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4617; SSE-NEXT:    movaps 592(%rdi), %xmm1
4618; SSE-NEXT:    movaps %xmm1, %xmm0
4619; SSE-NEXT:    movaps %xmm1, %xmm3
4620; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4621; SSE-NEXT:    movaps 560(%rdi), %xmm6
4622; SSE-NEXT:    movaps 528(%rdi), %xmm1
4623; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4624; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
4625; SSE-NEXT:    movaps %xmm1, %xmm2
4626; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4627; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4628; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4629; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4630; SSE-NEXT:    movaps 752(%rdi), %xmm1
4631; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4632; SSE-NEXT:    movaps 720(%rdi), %xmm0
4633; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4634; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4635; SSE-NEXT:    movaps 688(%rdi), %xmm2
4636; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4637; SSE-NEXT:    movaps 656(%rdi), %xmm1
4638; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4639; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4640; SSE-NEXT:    movaps %xmm1, %xmm2
4641; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4642; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4643; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4644; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4645; SSE-NEXT:    movaps 880(%rdi), %xmm1
4646; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4647; SSE-NEXT:    movaps 848(%rdi), %xmm14
4648; SSE-NEXT:    movaps %xmm14, %xmm0
4649; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4650; SSE-NEXT:    movaps 816(%rdi), %xmm1
4651; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4652; SSE-NEXT:    movaps 784(%rdi), %xmm2
4653; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4654; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4655; SSE-NEXT:    movaps %xmm2, %xmm1
4656; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4657; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4658; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4659; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4660; SSE-NEXT:    movaps 1008(%rdi), %xmm1
4661; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4662; SSE-NEXT:    movaps 976(%rdi), %xmm11
4663; SSE-NEXT:    movaps %xmm11, %xmm0
4664; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4665; SSE-NEXT:    movaps 944(%rdi), %xmm1
4666; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4667; SSE-NEXT:    movaps 912(%rdi), %xmm2
4668; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4669; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4670; SSE-NEXT:    movaps %xmm2, %xmm1
4671; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4672; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4673; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4674; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4675; SSE-NEXT:    movaps 112(%rdi), %xmm1
4676; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4677; SSE-NEXT:    movaps 80(%rdi), %xmm10
4678; SSE-NEXT:    movaps %xmm10, %xmm0
4679; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4680; SSE-NEXT:    movaps 16(%rdi), %xmm15
4681; SSE-NEXT:    movaps 48(%rdi), %xmm1
4682; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4683; SSE-NEXT:    movaps %xmm15, %xmm2
4684; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4685; SSE-NEXT:    movaps %xmm2, %xmm1
4686; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4687; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4688; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4689; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4690; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4691; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
4692; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4693; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
4694; SSE-NEXT:    movaps %xmm9, %xmm13
4695; SSE-NEXT:    unpckhps {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
4696; SSE-NEXT:    unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4697; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4698; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
4699; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4700; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4701; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
4702; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4703; SSE-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
4704; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4705; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4706; SSE-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
4707; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4708; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4709; SSE-NEXT:    # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
4710; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4711; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4712; SSE-NEXT:    # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4713; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4714; SSE-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4715; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4716; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4717; SSE-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
4718; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4719; SSE-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
4720; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4721; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4722; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4723; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4724; SSE-NEXT:    # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
4725; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4726; SSE-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
4727; SSE-NEXT:    movaps %xmm1, %xmm0
4728; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0]
4729; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4730; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
4731; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4732; SSE-NEXT:    movaps %xmm8, %xmm1
4733; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0]
4734; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4735; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1]
4736; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4737; SSE-NEXT:    movaps %xmm7, %xmm12
4738; SSE-NEXT:    movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0]
4739; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
4740; SSE-NEXT:    movaps %xmm5, %xmm13
4741; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4742; SSE-NEXT:    movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0]
4743; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
4744; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4745; SSE-NEXT:    movaps %xmm4, %xmm0
4746; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
4747; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
4748; SSE-NEXT:    movaps %xmm0, %xmm5
4749; SSE-NEXT:    movaps %xmm3, %xmm8
4750; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0]
4751; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1]
4752; SSE-NEXT:    movaps %xmm3, %xmm6
4753; SSE-NEXT:    movaps %xmm2, %xmm3
4754; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0]
4755; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
4756; SSE-NEXT:    movaps %xmm15, %xmm0
4757; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0]
4758; SSE-NEXT:    unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1]
4759; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4760; SSE-NEXT:    movaps %xmm1, 96(%rsi)
4761; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4762; SSE-NEXT:    movaps %xmm1, 32(%rsi)
4763; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4764; SSE-NEXT:    movaps %xmm1, 112(%rsi)
4765; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4766; SSE-NEXT:    movaps %xmm1, 48(%rsi)
4767; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4768; SSE-NEXT:    movaps %xmm1, 64(%rsi)
4769; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4770; SSE-NEXT:    movaps %xmm1, (%rsi)
4771; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4772; SSE-NEXT:    movaps %xmm1, 80(%rsi)
4773; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4774; SSE-NEXT:    movaps %xmm1, 16(%rsi)
4775; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4776; SSE-NEXT:    movaps %xmm1, 96(%rdx)
4777; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4778; SSE-NEXT:    movaps %xmm1, 32(%rdx)
4779; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4780; SSE-NEXT:    movaps %xmm1, 112(%rdx)
4781; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4782; SSE-NEXT:    movaps %xmm1, 48(%rdx)
4783; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4784; SSE-NEXT:    movaps %xmm1, 64(%rdx)
4785; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4786; SSE-NEXT:    movaps %xmm1, (%rdx)
4787; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4788; SSE-NEXT:    movaps %xmm1, 80(%rdx)
4789; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4790; SSE-NEXT:    movaps %xmm1, 16(%rdx)
4791; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4792; SSE-NEXT:    movaps %xmm1, 96(%rcx)
4793; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4794; SSE-NEXT:    movaps %xmm1, 32(%rcx)
4795; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4796; SSE-NEXT:    movaps %xmm1, 112(%rcx)
4797; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4798; SSE-NEXT:    movaps %xmm1, 48(%rcx)
4799; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4800; SSE-NEXT:    movaps %xmm1, 64(%rcx)
4801; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4802; SSE-NEXT:    movaps %xmm1, (%rcx)
4803; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4804; SSE-NEXT:    movaps %xmm1, 80(%rcx)
4805; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4806; SSE-NEXT:    movaps %xmm1, 16(%rcx)
4807; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4808; SSE-NEXT:    movaps %xmm1, 112(%r8)
4809; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4810; SSE-NEXT:    movaps %xmm1, 96(%r8)
4811; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4812; SSE-NEXT:    movaps %xmm1, 80(%r8)
4813; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4814; SSE-NEXT:    movaps %xmm1, 64(%r8)
4815; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4816; SSE-NEXT:    movaps %xmm1, 48(%r8)
4817; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4818; SSE-NEXT:    movaps %xmm1, 32(%r8)
4819; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4820; SSE-NEXT:    movaps %xmm1, 16(%r8)
4821; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4822; SSE-NEXT:    movaps %xmm1, (%r8)
4823; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4824; SSE-NEXT:    movaps %xmm1, 112(%r9)
4825; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4826; SSE-NEXT:    movaps %xmm1, 96(%r9)
4827; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4828; SSE-NEXT:    movaps %xmm1, 80(%r9)
4829; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4830; SSE-NEXT:    movaps %xmm1, 64(%r9)
4831; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4832; SSE-NEXT:    movaps %xmm1, 48(%r9)
4833; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4834; SSE-NEXT:    movaps %xmm1, 32(%r9)
4835; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4836; SSE-NEXT:    movaps %xmm1, 16(%r9)
4837; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4838; SSE-NEXT:    movaps %xmm1, (%r9)
4839; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4840; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4841; SSE-NEXT:    movaps %xmm1, 112(%rax)
4842; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4843; SSE-NEXT:    movaps %xmm1, 96(%rax)
4844; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4845; SSE-NEXT:    movaps %xmm1, 80(%rax)
4846; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4847; SSE-NEXT:    movaps %xmm1, 64(%rax)
4848; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4849; SSE-NEXT:    movaps %xmm1, 48(%rax)
4850; SSE-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
4851; SSE-NEXT:    movaps %xmm1, 32(%rax)
4852; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4853; SSE-NEXT:    movaps %xmm1, 16(%rax)
4854; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4855; SSE-NEXT:    movaps %xmm1, (%rax)
4856; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4857; SSE-NEXT:    movaps %xmm3, 112(%rax)
4858; SSE-NEXT:    movaps %xmm8, 96(%rax)
4859; SSE-NEXT:    movaps %xmm4, 80(%rax)
4860; SSE-NEXT:    movaps %xmm13, 64(%rax)
4861; SSE-NEXT:    movaps %xmm12, 48(%rax)
4862; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4863; SSE-NEXT:    movaps %xmm1, 32(%rax)
4864; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4865; SSE-NEXT:    movaps %xmm1, 16(%rax)
4866; SSE-NEXT:    movaps %xmm0, (%rax)
4867; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4868; SSE-NEXT:    movaps %xmm2, 112(%rax)
4869; SSE-NEXT:    movaps %xmm6, 96(%rax)
4870; SSE-NEXT:    movaps %xmm5, 80(%rax)
4871; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4872; SSE-NEXT:    movaps %xmm0, 64(%rax)
4873; SSE-NEXT:    movaps %xmm7, 48(%rax)
4874; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4875; SSE-NEXT:    movaps %xmm0, 32(%rax)
4876; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4877; SSE-NEXT:    movaps %xmm0, 16(%rax)
4878; SSE-NEXT:    movaps %xmm15, (%rax)
4879; SSE-NEXT:    addq $952, %rsp # imm = 0x3B8
4880; SSE-NEXT:    retq
4881;
4882; AVX-LABEL: load_i32_stride8_vf32:
4883; AVX:       # %bb.0:
4884; AVX-NEXT:    subq $1768, %rsp # imm = 0x6E8
4885; AVX-NEXT:    vmovaps 288(%rdi), %xmm14
4886; AVX-NEXT:    vmovaps 256(%rdi), %xmm10
4887; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
4888; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4889; AVX-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4890; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
4891; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4892; AVX-NEXT:    vmovaps 320(%rdi), %xmm2
4893; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4894; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4895; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4896; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4897; AVX-NEXT:    vmovaps 416(%rdi), %xmm1
4898; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
4899; AVX-NEXT:    vmovaps 384(%rdi), %xmm2
4900; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4901; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4902; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4903; AVX-NEXT:    vmovaps 480(%rdi), %xmm2
4904; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4905; AVX-NEXT:    vmovaps 448(%rdi), %xmm3
4906; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4907; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4908; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4909; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
4910; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
4911; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
4912; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4913; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4914; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
4915; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4916; AVX-NEXT:    vmovaps 896(%rdi), %xmm1
4917; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4918; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4919; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4920; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
4921; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4922; AVX-NEXT:    vmovaps 960(%rdi), %xmm2
4923; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4924; AVX-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4925; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1]
4926; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4927; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4928; AVX-NEXT:    vmovaps 800(%rdi), %xmm1
4929; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4930; AVX-NEXT:    vmovaps 768(%rdi), %xmm11
4931; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
4932; AVX-NEXT:    vmovaps 864(%rdi), %xmm2
4933; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4934; AVX-NEXT:    vmovaps 832(%rdi), %xmm3
4935; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4936; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4937; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0]
4938; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4939; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4940; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
4941; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4942; AVX-NEXT:    vmovaps 128(%rdi), %xmm0
4943; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4944; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4945; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4946; AVX-NEXT:    vmovaps 224(%rdi), %xmm1
4947; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4948; AVX-NEXT:    vmovaps 192(%rdi), %xmm2
4949; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4950; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4951; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4952; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
4953; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4954; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4955; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
4956; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4957; AVX-NEXT:    vmovaps (%rdi), %xmm13
4958; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
4959; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4960; AVX-NEXT:    vmovaps 96(%rdi), %xmm2
4961; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4962; AVX-NEXT:    vmovaps 64(%rdi), %xmm3
4963; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4964; AVX-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4965; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
4966; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4967; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4968; AVX-NEXT:    vmovaps 672(%rdi), %xmm12
4969; AVX-NEXT:    vmovaps 640(%rdi), %xmm0
4970; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4971; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
4972; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4973; AVX-NEXT:    vmovaps 736(%rdi), %xmm1
4974; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4975; AVX-NEXT:    vmovaps 704(%rdi), %xmm2
4976; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4977; AVX-NEXT:    vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4978; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1]
4979; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
4980; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
4981; AVX-NEXT:    vmovaps 544(%rdi), %xmm6
4982; AVX-NEXT:    vmovaps 512(%rdi), %xmm3
4983; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
4984; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4985; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4986; AVX-NEXT:    vmovaps 608(%rdi), %xmm4
4987; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4988; AVX-NEXT:    vmovaps 576(%rdi), %xmm2
4989; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4990; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
4991; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4992; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4993; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4994; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
4995; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
4996; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4997; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
4998; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
4999; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5000; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1]
5001; AVX-NEXT:    vmovaps (%rsp), %xmm14 # 16-byte Reload
5002; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3]
5003; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
5004; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7]
5005; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5006; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5007; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
5008; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5009; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
5010; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
5011; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm1
5012; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5013; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1]
5014; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5015; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3]
5016; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
5017; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
5018; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5019; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5020; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1]
5021; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5022; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
5023; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5024; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
5025; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5026; AVX-NEXT:    # xmm5 = mem[1,1,1,1]
5027; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5028; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3]
5029; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
5030; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
5031; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5032; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5033; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1]
5034; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
5035; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
5036; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm1
5037; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5038; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1]
5039; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3]
5040; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5041; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5042; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5043; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5044; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5045; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5046; AVX-NEXT:    # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
5047; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5048; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm14[2],xmm4[3],xmm14[3]
5049; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5050; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5051; AVX-NEXT:    # xmm0 = mem[2,2,2,2]
5052; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5053; AVX-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
5054; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5055; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
5056; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5057; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5058; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2]
5059; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5060; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
5061; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5062; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5063; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5064; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
5065; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5066; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm15[2],xmm9[3],xmm15[3]
5067; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
5068; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5069; AVX-NEXT:    # xmm2 = mem[2,2,2,2]
5070; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5071; AVX-NEXT:    # xmm2 = mem[0,1,2],xmm2[3]
5072; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5073; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
5074; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5075; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5076; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2]
5077; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5078; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3]
5079; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5080; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5081; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5082; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5083; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
5084; AVX-NEXT:    # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3]
5085; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5086; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5087; AVX-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
5088; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5089; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5090; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2]
5091; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5092; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
5093; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5094; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm6
5095; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
5096; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5097; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2]
5098; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5099; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
5100; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
5101; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
5102; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5103; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5104; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload
5105; AVX-NEXT:    # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3]
5106; AVX-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3]
5107; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5108; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2]
5109; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5110; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3]
5111; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
5112; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm8
5113; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
5114; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5115; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2]
5116; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5117; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3]
5118; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
5119; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
5120; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5121; AVX-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
5122; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5123; AVX-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm6[1]
5124; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5125; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
5126; AVX-NEXT:    # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3]
5127; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
5128; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5129; AVX-NEXT:    # xmm1 = mem[2,3,2,3]
5130; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5131; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
5132; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
5133; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5134; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
5135; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5136; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
5137; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5138; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
5139; AVX-NEXT:    # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
5140; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
5141; AVX-NEXT:    vpermilps $238, (%rsp), %xmm0 # 16-byte Folded Reload
5142; AVX-NEXT:    # xmm0 = mem[2,3,2,3]
5143; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5144; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
5145; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5146; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5147; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
5148; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5149; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5150; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
5151; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5152; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5153; AVX-NEXT:    # xmm3 = mem[2,3,2,3]
5154; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
5155; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
5156; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5157; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5158; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5159; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm0[1]
5160; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
5161; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5162; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
5163; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5164; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5165; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5166; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5167; AVX-NEXT:    vmovaps 416(%rdi), %ymm12
5168; AVX-NEXT:    vmovaps 384(%rdi), %ymm9
5169; AVX-NEXT:    vmovaps 448(%rdi), %ymm7
5170; AVX-NEXT:    vmovaps 480(%rdi), %ymm11
5171; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[2],ymm7[2]
5172; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5173; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5]
5174; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5175; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5176; AVX-NEXT:    vmovaps 320(%rdi), %ymm2
5177; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5178; AVX-NEXT:    vmovaps 352(%rdi), %ymm1
5179; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5180; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
5181; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5182; AVX-NEXT:    vmovaps 288(%rdi), %ymm2
5183; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5184; AVX-NEXT:    vmovaps 256(%rdi), %ymm10
5185; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5]
5186; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5187; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
5188; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
5189; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5190; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5191; AVX-NEXT:    vmovaps 928(%rdi), %ymm5
5192; AVX-NEXT:    vmovaps 896(%rdi), %ymm3
5193; AVX-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
5194; AVX-NEXT:    vmovaps 960(%rdi), %ymm1
5195; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5196; AVX-NEXT:    vmovaps 992(%rdi), %ymm0
5197; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5198; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5199; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5]
5200; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5201; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5202; AVX-NEXT:    vmovaps 832(%rdi), %ymm2
5203; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5204; AVX-NEXT:    vmovaps 864(%rdi), %ymm1
5205; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5206; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
5207; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5208; AVX-NEXT:    vmovaps 800(%rdi), %ymm3
5209; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5210; AVX-NEXT:    vmovaps 768(%rdi), %ymm2
5211; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5212; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
5213; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
5214; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
5215; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5216; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5217; AVX-NEXT:    vmovaps 672(%rdi), %ymm2
5218; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5219; AVX-NEXT:    vmovaps 640(%rdi), %ymm3
5220; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5221; AVX-NEXT:    vmovaps 704(%rdi), %ymm13
5222; AVX-NEXT:    vmovaps 736(%rdi), %ymm0
5223; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5224; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2]
5225; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5226; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5227; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5228; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
5229; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5230; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
5231; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5232; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5233; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm6
5234; AVX-NEXT:    vmovaps 544(%rdi), %ymm0
5235; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5236; AVX-NEXT:    vmovaps 512(%rdi), %ymm1
5237; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5238; AVX-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5239; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
5240; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,0]
5241; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7]
5242; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5243; AVX-NEXT:    vmovaps 160(%rdi), %ymm0
5244; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5245; AVX-NEXT:    vmovaps 128(%rdi), %ymm1
5246; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5247; AVX-NEXT:    vmovaps 192(%rdi), %ymm2
5248; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5249; AVX-NEXT:    vmovaps 224(%rdi), %ymm3
5250; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5251; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
5252; AVX-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5253; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4]
5254; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
5255; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5256; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
5257; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5258; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
5259; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm0
5260; AVX-NEXT:    vmovaps (%rdi), %ymm2
5261; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5262; AVX-NEXT:    vmovaps 32(%rdi), %ymm14
5263; AVX-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[4],ymm14[4],ymm2[5],ymm14[5]
5264; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5265; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
5266; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5267; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5268; AVX-NEXT:    vmovaps %ymm11, %ymm6
5269; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5270; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5]
5271; AVX-NEXT:    vmovaps %ymm12, %ymm7
5272; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5273; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm9[1,0],ymm12[5,4],ymm9[5,4]
5274; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
5275; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5276; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5277; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[4],ymm9[4],ymm3[5],ymm9[5]
5278; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5279; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5280; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4]
5281; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5282; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
5283; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5284; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5285; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5286; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5287; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5]
5288; AVX-NEXT:    vmovups (%rsp), %ymm12 # 32-byte Reload
5289; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm12[1,0],ymm5[5,4],ymm12[5,4]
5290; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
5291; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5292; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
5293; AVX-NEXT:    # ymm1 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5]
5294; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5295; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5296; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload
5297; AVX-NEXT:    # ymm15 = ymm8[1,0],mem[1,0],ymm8[5,4],mem[5,4]
5298; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5299; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
5300; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5301; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5302; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5303; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
5304; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5305; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
5306; AVX-NEXT:    # ymm1 = ymm13[1,0],mem[1,0],ymm13[5,4],mem[5,4]
5307; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
5308; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5309; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5310; AVX-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
5311; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5312; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5313; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
5314; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
5315; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5316; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
5317; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5318; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5319; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5320; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5321; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
5322; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5323; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5324; AVX-NEXT:    # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
5325; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
5326; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5327; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5328; AVX-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
5329; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5330; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
5331; AVX-NEXT:    # ymm15 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
5332; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5333; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
5334; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5335; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5336; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
5337; AVX-NEXT:    # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3]
5338; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5339; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7]
5340; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5341; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm3[1],ymm9[3],ymm3[3]
5342; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5343; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5344; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7]
5345; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5346; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0]
5347; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5348; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5349; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm2[1],ymm10[3],ymm2[3]
5350; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
5351; AVX-NEXT:    # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
5352; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5353; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5354; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
5355; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5356; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5357; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm8[2],ymm2[3],ymm8[3],ymm2[6],ymm8[6],ymm2[7],ymm8[7]
5358; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5359; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0]
5360; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5361; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5362; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5363; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
5364; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5365; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7]
5366; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5367; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5368; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5369; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm2[1],ymm6[3],ymm2[3]
5370; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5371; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5372; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5373; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7]
5374; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5375; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0]
5376; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5377; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5378; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5379; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5380; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
5381; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5382; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5383; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7]
5384; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
5385; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5386; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5387; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3]
5388; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5389; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5390; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
5391; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5392; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0]
5393; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5394; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5395; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5396; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5397; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
5398; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5399; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5400; AVX-NEXT:    # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
5401; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
5402; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5403; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5404; AVX-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
5405; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5406; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4]
5407; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5408; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
5409; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
5410; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5411; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
5412; AVX-NEXT:    # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7]
5413; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5414; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm4[3,0],ymm1[7,4],ymm4[7,4]
5415; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
5416; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7]
5417; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm7[3,0],ymm3[7,4],ymm7[7,4]
5418; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5419; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
5420; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
5421; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5422; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5423; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5424; AVX-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
5425; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5426; AVX-NEXT:    vshufps $51, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
5427; AVX-NEXT:    # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
5428; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7]
5429; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5430; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
5431; AVX-NEXT:    # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
5432; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5433; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
5434; AVX-NEXT:    # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
5435; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
5436; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
5437; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3]
5438; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5439; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload
5440; AVX-NEXT:    # ymm2 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
5441; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[3,0],ymm12[3,0],ymm10[7,4],ymm12[7,4]
5442; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7]
5443; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
5444; AVX-NEXT:    # ymm3 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
5445; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4]
5446; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
5447; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
5448; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3]
5449; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5450; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5451; AVX-NEXT:    vmovaps %ymm3, 64(%rsi)
5452; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5453; AVX-NEXT:    vmovaps %ymm3, (%rsi)
5454; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5455; AVX-NEXT:    vmovaps %ymm3, 96(%rsi)
5456; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5457; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
5458; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5459; AVX-NEXT:    vmovaps %ymm3, 64(%rdx)
5460; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5461; AVX-NEXT:    vmovaps %ymm3, (%rdx)
5462; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5463; AVX-NEXT:    vmovaps %ymm3, 96(%rdx)
5464; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5465; AVX-NEXT:    vmovaps %ymm3, 32(%rdx)
5466; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5467; AVX-NEXT:    vmovaps %ymm3, 64(%rcx)
5468; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5469; AVX-NEXT:    vmovaps %ymm3, (%rcx)
5470; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5471; AVX-NEXT:    vmovaps %ymm3, 96(%rcx)
5472; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5473; AVX-NEXT:    vmovaps %ymm3, 32(%rcx)
5474; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5475; AVX-NEXT:    vmovaps %ymm3, 64(%r8)
5476; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5477; AVX-NEXT:    vmovaps %ymm3, (%r8)
5478; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5479; AVX-NEXT:    vmovaps %ymm3, 96(%r8)
5480; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5481; AVX-NEXT:    vmovaps %ymm3, 32(%r8)
5482; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5483; AVX-NEXT:    vmovaps %ymm3, 64(%r9)
5484; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5485; AVX-NEXT:    vmovaps %ymm3, (%r9)
5486; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5487; AVX-NEXT:    vmovaps %ymm3, 96(%r9)
5488; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5489; AVX-NEXT:    vmovaps %ymm3, 32(%r9)
5490; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5491; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5492; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
5493; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5494; AVX-NEXT:    vmovaps %ymm3, (%rax)
5495; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5496; AVX-NEXT:    vmovaps %ymm3, 96(%rax)
5497; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5498; AVX-NEXT:    vmovaps %ymm3, 32(%rax)
5499; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5500; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5501; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
5502; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5503; AVX-NEXT:    vmovaps %ymm3, (%rax)
5504; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5505; AVX-NEXT:    vmovaps %ymm3, 96(%rax)
5506; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5507; AVX-NEXT:    vmovaps %ymm3, 32(%rax)
5508; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5509; AVX-NEXT:    vmovaps %ymm1, 96(%rax)
5510; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
5511; AVX-NEXT:    vmovaps %ymm9, 32(%rax)
5512; AVX-NEXT:    vmovaps %ymm2, (%rax)
5513; AVX-NEXT:    addq $1768, %rsp # imm = 0x6E8
5514; AVX-NEXT:    vzeroupper
5515; AVX-NEXT:    retq
5516;
5517; AVX2-LABEL: load_i32_stride8_vf32:
5518; AVX2:       # %bb.0:
5519; AVX2-NEXT:    subq $1544, %rsp # imm = 0x608
5520; AVX2-NEXT:    vmovaps 288(%rdi), %xmm0
5521; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5522; AVX2-NEXT:    vmovaps 256(%rdi), %xmm10
5523; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
5524; AVX2-NEXT:    vmovaps 352(%rdi), %xmm14
5525; AVX2-NEXT:    vbroadcastss %xmm14, %xmm1
5526; AVX2-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5527; AVX2-NEXT:    vmovaps 320(%rdi), %xmm15
5528; AVX2-NEXT:    vbroadcastss %xmm15, %xmm2
5529; AVX2-NEXT:    vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5530; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5531; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5532; AVX2-NEXT:    vmovaps 416(%rdi), %xmm1
5533; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5534; AVX2-NEXT:    vmovaps 384(%rdi), %xmm12
5535; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
5536; AVX2-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5537; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5538; AVX2-NEXT:    vmovaps 480(%rdi), %xmm2
5539; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5540; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
5541; AVX2-NEXT:    vmovaps 448(%rdi), %xmm3
5542; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5543; AVX2-NEXT:    vbroadcastss %xmm3, %xmm3
5544; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5545; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5546; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5547; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5548; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5549; AVX2-NEXT:    vmovaps 800(%rdi), %xmm1
5550; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5551; AVX2-NEXT:    vmovaps 768(%rdi), %xmm0
5552; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5553; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5554; AVX2-NEXT:    vmovaps 864(%rdi), %xmm1
5555; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5556; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
5557; AVX2-NEXT:    vmovaps 832(%rdi), %xmm2
5558; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5559; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
5560; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5561; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5562; AVX2-NEXT:    vmovaps 992(%rdi), %xmm1
5563; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5564; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
5565; AVX2-NEXT:    vmovaps 960(%rdi), %xmm11
5566; AVX2-NEXT:    vbroadcastss %xmm11, %xmm2
5567; AVX2-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5568; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5569; AVX2-NEXT:    vmovaps 928(%rdi), %xmm2
5570; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5571; AVX2-NEXT:    vmovaps 896(%rdi), %xmm3
5572; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5573; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5574; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5575; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5576; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5577; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5578; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5579; AVX2-NEXT:    vmovaps 608(%rdi), %xmm0
5580; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5581; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
5582; AVX2-NEXT:    vmovaps 576(%rdi), %xmm1
5583; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5584; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
5585; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5586; AVX2-NEXT:    vmovaps 544(%rdi), %xmm1
5587; AVX2-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
5588; AVX2-NEXT:    vmovaps 512(%rdi), %xmm2
5589; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5590; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5591; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5592; AVX2-NEXT:    vmovaps 736(%rdi), %xmm1
5593; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5594; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
5595; AVX2-NEXT:    vmovaps 704(%rdi), %xmm2
5596; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5597; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
5598; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5599; AVX2-NEXT:    vmovaps 672(%rdi), %xmm2
5600; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5601; AVX2-NEXT:    vmovaps 640(%rdi), %xmm3
5602; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5603; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5604; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5605; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5606; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5607; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5608; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5609; AVX2-NEXT:    vmovaps 224(%rdi), %xmm0
5610; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5611; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
5612; AVX2-NEXT:    vmovaps 192(%rdi), %xmm1
5613; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5614; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
5615; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5616; AVX2-NEXT:    vmovaps 160(%rdi), %xmm9
5617; AVX2-NEXT:    vmovaps 128(%rdi), %xmm8
5618; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
5619; AVX2-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5620; AVX2-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5621; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5622; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5623; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5624; AVX2-NEXT:    vmovaps 96(%rdi), %xmm7
5625; AVX2-NEXT:    vbroadcastss %xmm7, %xmm1
5626; AVX2-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5627; AVX2-NEXT:    vmovaps 64(%rdi), %xmm6
5628; AVX2-NEXT:    vbroadcastss %xmm6, %xmm2
5629; AVX2-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5630; AVX2-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5631; AVX2-NEXT:    vmovaps (%rdi), %xmm5
5632; AVX2-NEXT:    vmovaps 32(%rdi), %xmm4
5633; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5634; AVX2-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5635; AVX2-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5636; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
5637; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
5638; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5639; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
5640; AVX2-NEXT:    vmovaps %xmm10, %xmm3
5641; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5642; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
5643; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
5644; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5645; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5646; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5647; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
5648; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5649; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
5650; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5651; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
5652; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5653; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5654; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5655; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5656; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5657; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
5658; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5659; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
5660; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5661; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
5662; AVX2-NEXT:    # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1]
5663; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5664; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
5665; AVX2-NEXT:    # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
5666; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5667; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5668; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
5669; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5670; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
5671; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5672; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5673; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5674; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5675; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
5676; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
5677; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
5678; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5679; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5680; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
5681; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
5682; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5683; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
5684; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
5685; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5686; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5687; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5688; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5689; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5690; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
5691; AVX2-NEXT:    vmovaps (%rsp), %xmm7 # 16-byte Reload
5692; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
5693; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5694; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5695; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
5696; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5697; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5698; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5699; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5700; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5701; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5702; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
5703; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5704; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
5705; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5706; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5707; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5708; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5709; AVX2-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
5710; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5711; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5712; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5713; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
5714; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5715; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
5716; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
5717; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5718; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5719; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5720; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5721; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
5722; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5723; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
5724; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5725; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5726; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5727; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5728; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5729; AVX2-NEXT:    # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
5730; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5731; AVX2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
5732; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5733; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5734; AVX2-NEXT:    # xmm15 = mem[2,2,2,2]
5735; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5736; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
5737; AVX2-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
5738; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm14
5739; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
5740; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5741; AVX2-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2]
5742; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3]
5743; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3]
5744; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
5745; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5746; AVX2-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
5747; AVX2-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
5748; AVX2-NEXT:    vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2]
5749; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5750; AVX2-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
5751; AVX2-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
5752; AVX2-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm15
5753; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
5754; AVX2-NEXT:    vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
5755; AVX2-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5756; AVX2-NEXT:    # xmm15 = mem[0,1,2],xmm15[3]
5757; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
5758; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
5759; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5760; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5761; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
5762; AVX2-NEXT:    # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3]
5763; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5764; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2]
5765; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5766; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
5767; AVX2-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
5768; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm11
5769; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
5770; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5771; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
5772; AVX2-NEXT:    # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3]
5773; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5774; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2]
5775; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5776; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
5777; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
5778; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
5779; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5780; AVX2-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
5781; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5782; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
5783; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5784; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
5785; AVX2-NEXT:    # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
5786; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
5787; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5788; AVX2-NEXT:    # xmm3 = mem[2,3,2,3]
5789; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
5790; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
5791; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5792; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5793; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5794; AVX2-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
5795; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5796; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
5797; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
5798; AVX2-NEXT:    # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
5799; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5800; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5801; AVX2-NEXT:    # xmm1 = mem[2,3,2,3]
5802; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5803; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5804; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5805; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5806; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5807; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5808; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
5809; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
5810; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
5811; AVX2-NEXT:    # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
5812; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5813; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
5814; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
5815; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5816; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5817; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5818; AVX2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
5819; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5820; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
5821; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5822; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5823; AVX2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
5824; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1]
5825; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5826; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5827; AVX2-NEXT:    vmovaps 288(%rdi), %ymm0
5828; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5829; AVX2-NEXT:    vmovaps 256(%rdi), %ymm1
5830; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5831; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5832; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5833; AVX2-NEXT:    vmovaps 352(%rdi), %ymm1
5834; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5835; AVX2-NEXT:    vmovaps 320(%rdi), %ymm2
5836; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5837; AVX2-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5838; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2]
5839; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5840; AVX2-NEXT:    vmovaps 480(%rdi), %ymm2
5841; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5842; AVX2-NEXT:    vmovaps 448(%rdi), %ymm3
5843; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5844; AVX2-NEXT:    vmovaps 416(%rdi), %ymm8
5845; AVX2-NEXT:    vmovaps 384(%rdi), %ymm1
5846; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5847; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
5848; AVX2-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5849; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
5850; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5851; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5852; AVX2-NEXT:    vmovaps 800(%rdi), %ymm0
5853; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5854; AVX2-NEXT:    vmovaps 768(%rdi), %ymm1
5855; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5856; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5857; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5858; AVX2-NEXT:    vmovaps 864(%rdi), %ymm1
5859; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5860; AVX2-NEXT:    vmovaps 832(%rdi), %ymm2
5861; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5862; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5863; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
5864; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
5865; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5866; AVX2-NEXT:    vmovaps 992(%rdi), %ymm2
5867; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5868; AVX2-NEXT:    vmovaps 960(%rdi), %ymm3
5869; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5870; AVX2-NEXT:    vmovaps 928(%rdi), %ymm7
5871; AVX2-NEXT:    vmovaps 896(%rdi), %ymm1
5872; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5873; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
5874; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5875; AVX2-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5876; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
5877; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5878; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5879; AVX2-NEXT:    vmovaps 32(%rdi), %ymm0
5880; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5881; AVX2-NEXT:    vmovaps (%rdi), %ymm1
5882; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5883; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5884; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5885; AVX2-NEXT:    vmovaps 96(%rdi), %ymm1
5886; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5887; AVX2-NEXT:    vmovaps 64(%rdi), %ymm2
5888; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5889; AVX2-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5890; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
5891; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5892; AVX2-NEXT:    vmovaps 224(%rdi), %ymm2
5893; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5894; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
5895; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5896; AVX2-NEXT:    vmovaps 160(%rdi), %ymm15
5897; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
5898; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5899; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
5900; AVX2-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5901; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
5902; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5903; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5904; AVX2-NEXT:    vmovaps 544(%rdi), %ymm14
5905; AVX2-NEXT:    vmovaps 512(%rdi), %ymm11
5906; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
5907; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5908; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
5909; AVX2-NEXT:    vmovaps 608(%rdi), %ymm1
5910; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5911; AVX2-NEXT:    vmovaps 576(%rdi), %ymm2
5912; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5913; AVX2-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5914; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2]
5915; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3]
5916; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
5917; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5918; AVX2-NEXT:    vmovaps 704(%rdi), %ymm3
5919; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5920; AVX2-NEXT:    vmovaps 672(%rdi), %ymm5
5921; AVX2-NEXT:    vmovaps 640(%rdi), %ymm0
5922; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5923; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5]
5924; AVX2-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5925; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5926; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5927; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5928; AVX2-NEXT:    vbroadcastss 404(%rdi), %ymm0
5929; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
5930; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
5931; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm1
5932; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5933; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5]
5934; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5935; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7]
5936; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm10
5937; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
5938; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5939; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5940; AVX2-NEXT:    vbroadcastss 916(%rdi), %ymm0
5941; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
5942; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
5943; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
5944; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
5945; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5946; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
5947; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5948; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
5949; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
5950; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
5951; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5952; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
5953; AVX2-NEXT:    vbroadcastss 148(%rdi), %ymm0
5954; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
5955; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
5956; AVX2-NEXT:    vextractf128 $1, %ymm13, %xmm1
5957; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5958; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
5959; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5960; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
5961; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
5962; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5963; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5964; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5965; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm0
5966; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
5967; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
5968; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm1
5969; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
5970; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
5971; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
5972; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5973; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5974; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5975; AVX2-NEXT:    vbroadcastss 504(%rdi), %ymm0
5976; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
5977; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
5978; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5979; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
5980; AVX2-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
5981; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5982; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5983; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
5984; AVX2-NEXT:    # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
5985; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm2
5986; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2]
5987; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5988; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5989; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5990; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5991; AVX2-NEXT:    vbroadcastss 1016(%rdi), %ymm1
5992; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5993; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7]
5994; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5995; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
5996; AVX2-NEXT:    # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
5997; AVX2-NEXT:    vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
5998; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5999; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
6000; AVX2-NEXT:    # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6001; AVX2-NEXT:    vextractf128 $1, %ymm12, %xmm2
6002; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2]
6003; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
6004; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6005; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6006; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6007; AVX2-NEXT:    vbroadcastss 248(%rdi), %ymm2
6008; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6009; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6010; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6011; AVX2-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
6012; AVX2-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
6013; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6014; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
6015; AVX2-NEXT:    # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6016; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm8
6017; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
6018; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
6019; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6020; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
6021; AVX2-NEXT:    vbroadcastss 760(%rdi), %ymm2
6022; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6023; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6024; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6025; AVX2-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
6026; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6027; AVX2-NEXT:    # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
6028; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6029; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
6030; AVX2-NEXT:    # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6031; AVX2-NEXT:    vextractf128 $1, %ymm11, %xmm15
6032; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
6033; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
6034; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
6035; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6036; AVX2-NEXT:    vbroadcastss 220(%rdi), %ymm1
6037; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6038; AVX2-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6039; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
6040; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm4
6041; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
6042; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm9
6043; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
6044; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6045; AVX2-NEXT:    vbroadcastss 476(%rdi), %ymm1
6046; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6047; AVX2-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6048; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6049; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm1
6050; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6051; AVX2-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
6052; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
6053; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
6054; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6055; AVX2-NEXT:    vbroadcastss 732(%rdi), %ymm1
6056; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6057; AVX2-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6058; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3]
6059; AVX2-NEXT:    vextractf128 $1, %ymm13, %xmm5
6060; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7]
6061; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
6062; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6063; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
6064; AVX2-NEXT:    vbroadcastss 988(%rdi), %ymm1
6065; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6066; AVX2-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6067; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
6068; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm3
6069; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7]
6070; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
6071; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
6072; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6073; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6074; AVX2-NEXT:    vmovaps %ymm3, 64(%rsi)
6075; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6076; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
6077; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6078; AVX2-NEXT:    vmovaps %ymm3, 96(%rsi)
6079; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6080; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
6081; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6082; AVX2-NEXT:    vmovaps %ymm3, 64(%rdx)
6083; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6084; AVX2-NEXT:    vmovaps %ymm3, (%rdx)
6085; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6086; AVX2-NEXT:    vmovaps %ymm3, 96(%rdx)
6087; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6088; AVX2-NEXT:    vmovaps %ymm3, 32(%rdx)
6089; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6090; AVX2-NEXT:    vmovaps %ymm3, 64(%rcx)
6091; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6092; AVX2-NEXT:    vmovaps %ymm3, (%rcx)
6093; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6094; AVX2-NEXT:    vmovaps %ymm3, 96(%rcx)
6095; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6096; AVX2-NEXT:    vmovaps %ymm3, 32(%rcx)
6097; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6098; AVX2-NEXT:    vmovaps %ymm3, 64(%r8)
6099; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6100; AVX2-NEXT:    vmovaps %ymm3, (%r8)
6101; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6102; AVX2-NEXT:    vmovaps %ymm3, 96(%r8)
6103; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6104; AVX2-NEXT:    vmovaps %ymm3, 32(%r8)
6105; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6106; AVX2-NEXT:    vmovaps %ymm3, 64(%r9)
6107; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6108; AVX2-NEXT:    vmovaps %ymm3, (%r9)
6109; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6110; AVX2-NEXT:    vmovaps %ymm3, 96(%r9)
6111; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6112; AVX2-NEXT:    vmovaps %ymm3, 32(%r9)
6113; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6114; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6115; AVX2-NEXT:    vmovaps %ymm3, 64(%rax)
6116; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6117; AVX2-NEXT:    vmovaps %ymm3, (%rax)
6118; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
6119; AVX2-NEXT:    vmovaps %ymm3, 96(%rax)
6120; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6121; AVX2-NEXT:    vmovaps %ymm3, 32(%rax)
6122; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6123; AVX2-NEXT:    vmovaps %ymm2, 64(%rax)
6124; AVX2-NEXT:    vmovaps %ymm8, (%rax)
6125; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6126; AVX2-NEXT:    vmovaps %ymm2, 96(%rax)
6127; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6128; AVX2-NEXT:    vmovaps %ymm2, 32(%rax)
6129; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6130; AVX2-NEXT:    vmovaps %ymm1, 96(%rax)
6131; AVX2-NEXT:    vmovaps %ymm5, 64(%rax)
6132; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
6133; AVX2-NEXT:    vmovaps %ymm4, (%rax)
6134; AVX2-NEXT:    addq $1544, %rsp # imm = 0x608
6135; AVX2-NEXT:    vzeroupper
6136; AVX2-NEXT:    retq
6137;
6138; AVX2-FP-LABEL: load_i32_stride8_vf32:
6139; AVX2-FP:       # %bb.0:
6140; AVX2-FP-NEXT:    subq $1544, %rsp # imm = 0x608
6141; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm0
6142; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6143; AVX2-FP-NEXT:    vmovaps 256(%rdi), %xmm10
6144; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
6145; AVX2-FP-NEXT:    vmovaps 352(%rdi), %xmm14
6146; AVX2-FP-NEXT:    vbroadcastss %xmm14, %xmm1
6147; AVX2-FP-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6148; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm15
6149; AVX2-FP-NEXT:    vbroadcastss %xmm15, %xmm2
6150; AVX2-FP-NEXT:    vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6151; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6152; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6153; AVX2-FP-NEXT:    vmovaps 416(%rdi), %xmm1
6154; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6155; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm12
6156; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
6157; AVX2-FP-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6158; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6159; AVX2-FP-NEXT:    vmovaps 480(%rdi), %xmm2
6160; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6161; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
6162; AVX2-FP-NEXT:    vmovaps 448(%rdi), %xmm3
6163; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6164; AVX2-FP-NEXT:    vbroadcastss %xmm3, %xmm3
6165; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6166; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6167; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6168; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6169; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6170; AVX2-FP-NEXT:    vmovaps 800(%rdi), %xmm1
6171; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6172; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm0
6173; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6174; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6175; AVX2-FP-NEXT:    vmovaps 864(%rdi), %xmm1
6176; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6177; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
6178; AVX2-FP-NEXT:    vmovaps 832(%rdi), %xmm2
6179; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6180; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
6181; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6182; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6183; AVX2-FP-NEXT:    vmovaps 992(%rdi), %xmm1
6184; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6185; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
6186; AVX2-FP-NEXT:    vmovaps 960(%rdi), %xmm11
6187; AVX2-FP-NEXT:    vbroadcastss %xmm11, %xmm2
6188; AVX2-FP-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6189; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6190; AVX2-FP-NEXT:    vmovaps 928(%rdi), %xmm2
6191; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6192; AVX2-FP-NEXT:    vmovaps 896(%rdi), %xmm3
6193; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6194; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6195; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6196; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6197; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6198; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6199; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6200; AVX2-FP-NEXT:    vmovaps 608(%rdi), %xmm0
6201; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6202; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
6203; AVX2-FP-NEXT:    vmovaps 576(%rdi), %xmm1
6204; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6205; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
6206; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6207; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm1
6208; AVX2-FP-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
6209; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm2
6210; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6211; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6212; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6213; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm1
6214; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6215; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
6216; AVX2-FP-NEXT:    vmovaps 704(%rdi), %xmm2
6217; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6218; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
6219; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6220; AVX2-FP-NEXT:    vmovaps 672(%rdi), %xmm2
6221; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6222; AVX2-FP-NEXT:    vmovaps 640(%rdi), %xmm3
6223; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6224; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6225; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6226; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6227; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6228; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6229; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6230; AVX2-FP-NEXT:    vmovaps 224(%rdi), %xmm0
6231; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6232; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
6233; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm1
6234; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6235; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
6236; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6237; AVX2-FP-NEXT:    vmovaps 160(%rdi), %xmm9
6238; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm8
6239; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
6240; AVX2-FP-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6241; AVX2-FP-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6242; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6243; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6244; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6245; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm7
6246; AVX2-FP-NEXT:    vbroadcastss %xmm7, %xmm1
6247; AVX2-FP-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6248; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm6
6249; AVX2-FP-NEXT:    vbroadcastss %xmm6, %xmm2
6250; AVX2-FP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6251; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6252; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm5
6253; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm4
6254; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6255; AVX2-FP-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6256; AVX2-FP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6257; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
6258; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
6259; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6260; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
6261; AVX2-FP-NEXT:    vmovaps %xmm10, %xmm3
6262; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6263; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
6264; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6265; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6266; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6267; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6268; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6269; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6270; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
6271; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6272; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
6273; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6274; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6275; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6276; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6277; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6278; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
6279; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6280; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
6281; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6282; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
6283; AVX2-FP-NEXT:    # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1]
6284; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6285; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
6286; AVX2-FP-NEXT:    # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
6287; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6288; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6289; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
6290; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6291; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
6292; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6293; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6294; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6295; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6296; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
6297; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
6298; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
6299; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6300; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6301; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6302; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
6303; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6304; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
6305; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
6306; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6307; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6308; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6309; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6310; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6311; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
6312; AVX2-FP-NEXT:    vmovaps (%rsp), %xmm7 # 16-byte Reload
6313; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
6314; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6315; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6316; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
6317; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6318; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6319; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6320; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
6321; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6322; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6323; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
6324; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6325; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
6326; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6327; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6328; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6329; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6330; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
6331; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6332; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6333; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
6334; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
6335; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6336; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
6337; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
6338; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6339; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6340; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6341; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6342; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
6343; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6344; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6345; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6346; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6347; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6348; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6349; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6350; AVX2-FP-NEXT:    # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
6351; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6352; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6353; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6354; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6355; AVX2-FP-NEXT:    # xmm15 = mem[2,2,2,2]
6356; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6357; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
6358; AVX2-FP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
6359; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm14
6360; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
6361; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6362; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2]
6363; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3]
6364; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3]
6365; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
6366; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6367; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
6368; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
6369; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2]
6370; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6371; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
6372; AVX2-FP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
6373; AVX2-FP-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm15
6374; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
6375; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
6376; AVX2-FP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6377; AVX2-FP-NEXT:    # xmm15 = mem[0,1,2],xmm15[3]
6378; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
6379; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
6380; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6381; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6382; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
6383; AVX2-FP-NEXT:    # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3]
6384; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6385; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2]
6386; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6387; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
6388; AVX2-FP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
6389; AVX2-FP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm11
6390; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
6391; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6392; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
6393; AVX2-FP-NEXT:    # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3]
6394; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6395; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2]
6396; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6397; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
6398; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6399; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
6400; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6401; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
6402; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6403; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
6404; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6405; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
6406; AVX2-FP-NEXT:    # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
6407; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
6408; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6409; AVX2-FP-NEXT:    # xmm3 = mem[2,3,2,3]
6410; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6411; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
6412; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6413; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6414; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6415; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6416; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6417; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
6418; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
6419; AVX2-FP-NEXT:    # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
6420; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6421; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6422; AVX2-FP-NEXT:    # xmm1 = mem[2,3,2,3]
6423; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6424; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6425; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6426; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6427; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6428; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6429; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
6430; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
6431; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
6432; AVX2-FP-NEXT:    # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
6433; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6434; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
6435; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6436; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6437; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6438; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6439; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
6440; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6441; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
6442; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6443; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6444; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
6445; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1]
6446; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6447; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6448; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm0
6449; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6450; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm1
6451; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6452; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6453; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
6454; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm1
6455; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6456; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm2
6457; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6458; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6459; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2]
6460; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6461; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm2
6462; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6463; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm3
6464; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6465; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm8
6466; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm1
6467; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6468; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
6469; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6470; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
6471; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6472; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6473; AVX2-FP-NEXT:    vmovaps 800(%rdi), %ymm0
6474; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6475; AVX2-FP-NEXT:    vmovaps 768(%rdi), %ymm1
6476; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6477; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6478; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
6479; AVX2-FP-NEXT:    vmovaps 864(%rdi), %ymm1
6480; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6481; AVX2-FP-NEXT:    vmovaps 832(%rdi), %ymm2
6482; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6483; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6484; AVX2-FP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
6485; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
6486; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6487; AVX2-FP-NEXT:    vmovaps 992(%rdi), %ymm2
6488; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6489; AVX2-FP-NEXT:    vmovaps 960(%rdi), %ymm3
6490; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6491; AVX2-FP-NEXT:    vmovaps 928(%rdi), %ymm7
6492; AVX2-FP-NEXT:    vmovaps 896(%rdi), %ymm1
6493; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6494; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
6495; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6496; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6497; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
6498; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6499; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6500; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm0
6501; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6502; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
6503; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6504; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6505; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
6506; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm1
6507; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6508; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm2
6509; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6510; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6511; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
6512; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6513; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm2
6514; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6515; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
6516; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6517; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm15
6518; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
6519; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6520; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
6521; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6522; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
6523; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6524; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6525; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm14
6526; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm11
6527; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
6528; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6529; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
6530; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm1
6531; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6532; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm2
6533; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6534; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6535; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2]
6536; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3]
6537; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
6538; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6539; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm3
6540; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6541; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm5
6542; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm0
6543; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6544; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5]
6545; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6546; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
6547; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6548; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6549; AVX2-FP-NEXT:    vbroadcastss 404(%rdi), %ymm0
6550; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
6551; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
6552; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm1
6553; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6554; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5]
6555; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6556; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7]
6557; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm10
6558; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
6559; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6560; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6561; AVX2-FP-NEXT:    vbroadcastss 916(%rdi), %ymm0
6562; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
6563; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
6564; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
6565; AVX2-FP-NEXT:    vextractf128 $1, %ymm1, %xmm1
6566; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6567; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
6568; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6569; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
6570; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
6571; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
6572; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6573; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
6574; AVX2-FP-NEXT:    vbroadcastss 148(%rdi), %ymm0
6575; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
6576; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6577; AVX2-FP-NEXT:    vextractf128 $1, %ymm13, %xmm1
6578; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6579; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
6580; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6581; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
6582; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
6583; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
6584; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6585; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6586; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm0
6587; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
6588; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6589; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm1
6590; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
6591; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
6592; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
6593; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6594; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6595; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6596; AVX2-FP-NEXT:    vbroadcastss 504(%rdi), %ymm0
6597; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
6598; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
6599; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6600; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
6601; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
6602; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6603; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6604; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
6605; AVX2-FP-NEXT:    # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6606; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm2
6607; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2]
6608; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
6609; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6610; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6611; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6612; AVX2-FP-NEXT:    vbroadcastss 1016(%rdi), %ymm1
6613; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6614; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7]
6615; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6616; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
6617; AVX2-FP-NEXT:    # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6618; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
6619; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6620; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
6621; AVX2-FP-NEXT:    # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6622; AVX2-FP-NEXT:    vextractf128 $1, %ymm12, %xmm2
6623; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2]
6624; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
6625; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6626; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6627; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6628; AVX2-FP-NEXT:    vbroadcastss 248(%rdi), %ymm2
6629; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6630; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6631; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6632; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
6633; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
6634; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6635; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
6636; AVX2-FP-NEXT:    # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6637; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm8
6638; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
6639; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
6640; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6641; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
6642; AVX2-FP-NEXT:    vbroadcastss 760(%rdi), %ymm2
6643; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6644; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6645; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6646; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
6647; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6648; AVX2-FP-NEXT:    # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
6649; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6650; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
6651; AVX2-FP-NEXT:    # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6652; AVX2-FP-NEXT:    vextractf128 $1, %ymm11, %xmm15
6653; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
6654; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
6655; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
6656; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6657; AVX2-FP-NEXT:    vbroadcastss 220(%rdi), %ymm1
6658; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6659; AVX2-FP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6660; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
6661; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm4
6662; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
6663; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm9
6664; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
6665; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6666; AVX2-FP-NEXT:    vbroadcastss 476(%rdi), %ymm1
6667; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6668; AVX2-FP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6669; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6670; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm1
6671; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6672; AVX2-FP-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
6673; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
6674; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
6675; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6676; AVX2-FP-NEXT:    vbroadcastss 732(%rdi), %ymm1
6677; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6678; AVX2-FP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6679; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3]
6680; AVX2-FP-NEXT:    vextractf128 $1, %ymm13, %xmm5
6681; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7]
6682; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
6683; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6684; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
6685; AVX2-FP-NEXT:    vbroadcastss 988(%rdi), %ymm1
6686; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6687; AVX2-FP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6688; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
6689; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm3
6690; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7]
6691; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
6692; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
6693; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6694; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6695; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%rsi)
6696; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6697; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
6698; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6699; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rsi)
6700; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6701; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
6702; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6703; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%rdx)
6704; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6705; AVX2-FP-NEXT:    vmovaps %ymm3, (%rdx)
6706; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6707; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rdx)
6708; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6709; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rdx)
6710; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6711; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%rcx)
6712; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6713; AVX2-FP-NEXT:    vmovaps %ymm3, (%rcx)
6714; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6715; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rcx)
6716; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6717; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rcx)
6718; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6719; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%r8)
6720; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6721; AVX2-FP-NEXT:    vmovaps %ymm3, (%r8)
6722; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6723; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%r8)
6724; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6725; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%r8)
6726; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6727; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%r9)
6728; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6729; AVX2-FP-NEXT:    vmovaps %ymm3, (%r9)
6730; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6731; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%r9)
6732; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6733; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%r9)
6734; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6735; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6736; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%rax)
6737; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6738; AVX2-FP-NEXT:    vmovaps %ymm3, (%rax)
6739; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
6740; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rax)
6741; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6742; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rax)
6743; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6744; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rax)
6745; AVX2-FP-NEXT:    vmovaps %ymm8, (%rax)
6746; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6747; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rax)
6748; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6749; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rax)
6750; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6751; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rax)
6752; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rax)
6753; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
6754; AVX2-FP-NEXT:    vmovaps %ymm4, (%rax)
6755; AVX2-FP-NEXT:    addq $1544, %rsp # imm = 0x608
6756; AVX2-FP-NEXT:    vzeroupper
6757; AVX2-FP-NEXT:    retq
6758;
6759; AVX2-FCP-LABEL: load_i32_stride8_vf32:
6760; AVX2-FCP:       # %bb.0:
6761; AVX2-FCP-NEXT:    subq $1544, %rsp # imm = 0x608
6762; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm0
6763; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6764; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %xmm10
6765; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
6766; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %xmm14
6767; AVX2-FCP-NEXT:    vbroadcastss %xmm14, %xmm1
6768; AVX2-FCP-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6769; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %xmm15
6770; AVX2-FCP-NEXT:    vbroadcastss %xmm15, %xmm2
6771; AVX2-FCP-NEXT:    vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6772; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6773; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6774; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %xmm1
6775; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6776; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm12
6777; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
6778; AVX2-FCP-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6779; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6780; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %xmm2
6781; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6782; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
6783; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %xmm3
6784; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6785; AVX2-FCP-NEXT:    vbroadcastss %xmm3, %xmm3
6786; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6787; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6788; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6789; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6790; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6791; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %xmm1
6792; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6793; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %xmm0
6794; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6795; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6796; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %xmm1
6797; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6798; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
6799; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %xmm2
6800; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6801; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
6802; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6803; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6804; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %xmm1
6805; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6806; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
6807; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %xmm11
6808; AVX2-FCP-NEXT:    vbroadcastss %xmm11, %xmm2
6809; AVX2-FCP-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6810; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6811; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %xmm2
6812; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6813; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %xmm3
6814; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6815; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6816; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6817; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6818; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6819; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6820; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6821; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %xmm0
6822; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6823; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
6824; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %xmm1
6825; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6826; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
6827; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6828; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %xmm1
6829; AVX2-FCP-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
6830; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %xmm2
6831; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6832; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6833; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6834; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %xmm1
6835; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6836; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
6837; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %xmm2
6838; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6839; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
6840; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6841; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %xmm2
6842; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6843; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %xmm3
6844; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6845; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6846; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6847; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6848; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6849; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6850; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6851; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %xmm0
6852; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6853; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
6854; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm1
6855; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6856; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
6857; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6858; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %xmm9
6859; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm8
6860; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
6861; AVX2-FCP-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6862; AVX2-FCP-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6863; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6864; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6865; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6866; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm7
6867; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm1
6868; AVX2-FCP-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6869; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm6
6870; AVX2-FCP-NEXT:    vbroadcastss %xmm6, %xmm2
6871; AVX2-FCP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6872; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6873; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm5
6874; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm4
6875; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6876; AVX2-FCP-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6877; AVX2-FCP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6878; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
6879; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
6880; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6881; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
6882; AVX2-FCP-NEXT:    vmovaps %xmm10, %xmm3
6883; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6884; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
6885; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6886; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6887; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6888; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6889; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6890; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6891; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
6892; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6893; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
6894; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6895; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6896; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6897; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6898; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6899; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
6900; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6901; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
6902; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6903; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
6904; AVX2-FCP-NEXT:    # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1]
6905; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6906; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
6907; AVX2-FCP-NEXT:    # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
6908; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6909; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6910; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
6911; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6912; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
6913; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6914; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6915; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6916; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6917; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
6918; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
6919; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
6920; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6921; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6922; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6923; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
6924; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6925; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
6926; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
6927; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6928; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6929; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6930; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6931; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6932; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
6933; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm7 # 16-byte Reload
6934; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
6935; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6936; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6937; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
6938; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6939; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6940; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6941; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
6942; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6943; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6944; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
6945; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6946; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
6947; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
6948; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6949; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6950; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6951; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
6952; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6953; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6954; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
6955; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
6956; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6957; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
6958; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
6959; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6960; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6961; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6962; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6963; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
6964; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6965; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6966; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6967; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6968; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6969; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6970; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6971; AVX2-FCP-NEXT:    # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
6972; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6973; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6974; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6975; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6976; AVX2-FCP-NEXT:    # xmm15 = mem[2,2,2,2]
6977; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6978; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
6979; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
6980; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm14
6981; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
6982; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6983; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2]
6984; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3]
6985; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3]
6986; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
6987; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6988; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
6989; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
6990; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2]
6991; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6992; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
6993; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
6994; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm15
6995; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
6996; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
6997; AVX2-FCP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6998; AVX2-FCP-NEXT:    # xmm15 = mem[0,1,2],xmm15[3]
6999; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
7000; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
7001; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7002; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7003; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
7004; AVX2-FCP-NEXT:    # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3]
7005; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7006; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2]
7007; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7008; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
7009; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
7010; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm11
7011; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
7012; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7013; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
7014; AVX2-FCP-NEXT:    # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3]
7015; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7016; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2]
7017; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7018; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
7019; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
7020; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
7021; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7022; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
7023; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7024; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
7025; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7026; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
7027; AVX2-FCP-NEXT:    # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
7028; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
7029; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7030; AVX2-FCP-NEXT:    # xmm3 = mem[2,3,2,3]
7031; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
7032; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
7033; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
7034; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7035; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7036; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7037; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7038; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
7039; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
7040; AVX2-FCP-NEXT:    # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
7041; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7042; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7043; AVX2-FCP-NEXT:    # xmm1 = mem[2,3,2,3]
7044; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
7045; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
7046; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7047; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7048; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7049; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7050; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
7051; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
7052; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
7053; AVX2-FCP-NEXT:    # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
7054; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
7055; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
7056; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7057; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
7058; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7059; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7060; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
7061; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
7062; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
7063; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
7064; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7065; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
7066; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1]
7067; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7068; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7069; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm0
7070; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7071; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm1
7072; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7073; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7074; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
7075; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm1
7076; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7077; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm2
7078; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7079; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7080; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2]
7081; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7082; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm2
7083; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7084; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm3
7085; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7086; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm8
7087; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm1
7088; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7089; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
7090; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7091; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
7092; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7093; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7094; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %ymm0
7095; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7096; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %ymm1
7097; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7098; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7099; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
7100; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %ymm1
7101; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7102; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %ymm2
7103; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7104; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7105; AVX2-FCP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
7106; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
7107; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7108; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %ymm2
7109; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7110; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %ymm3
7111; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7112; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %ymm7
7113; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %ymm1
7114; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7115; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
7116; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7117; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7118; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
7119; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7120; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7121; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm0
7122; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7123; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
7124; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7125; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7126; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
7127; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
7128; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7129; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm2
7130; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7131; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7132; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
7133; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7134; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm2
7135; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7136; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm3
7137; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7138; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm15
7139; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
7140; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7141; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
7142; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7143; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
7144; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7145; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7146; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm14
7147; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm11
7148; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
7149; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7150; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
7151; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm1
7152; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7153; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm2
7154; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7155; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7156; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2]
7157; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3]
7158; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
7159; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7160; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm3
7161; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7162; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm5
7163; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm0
7164; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7165; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5]
7166; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7167; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
7168; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7169; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7170; AVX2-FCP-NEXT:    vbroadcastss 404(%rdi), %ymm0
7171; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
7172; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
7173; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm1
7174; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7175; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5]
7176; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7177; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7]
7178; AVX2-FCP-NEXT:    vextractf128 $1, %ymm10, %xmm10
7179; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
7180; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7181; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7182; AVX2-FCP-NEXT:    vbroadcastss 916(%rdi), %ymm0
7183; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
7184; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
7185; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
7186; AVX2-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
7187; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7188; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
7189; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
7190; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
7191; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm7
7192; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
7193; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7194; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
7195; AVX2-FCP-NEXT:    vbroadcastss 148(%rdi), %ymm0
7196; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
7197; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
7198; AVX2-FCP-NEXT:    vextractf128 $1, %ymm13, %xmm1
7199; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7200; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
7201; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7202; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
7203; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
7204; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
7205; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7206; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7207; AVX2-FCP-NEXT:    vbroadcastss 660(%rdi), %ymm0
7208; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
7209; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
7210; AVX2-FCP-NEXT:    vextractf128 $1, %ymm9, %xmm1
7211; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
7212; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
7213; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
7214; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
7215; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7216; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7217; AVX2-FCP-NEXT:    vbroadcastss 504(%rdi), %ymm0
7218; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
7219; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
7220; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7221; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
7222; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
7223; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7224; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7225; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
7226; AVX2-FCP-NEXT:    # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
7227; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm2
7228; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2]
7229; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
7230; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7231; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7232; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7233; AVX2-FCP-NEXT:    vbroadcastss 1016(%rdi), %ymm1
7234; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7235; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7]
7236; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7237; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
7238; AVX2-FCP-NEXT:    # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
7239; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
7240; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7241; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
7242; AVX2-FCP-NEXT:    # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
7243; AVX2-FCP-NEXT:    vextractf128 $1, %ymm12, %xmm2
7244; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2]
7245; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
7246; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
7247; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7248; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7249; AVX2-FCP-NEXT:    vbroadcastss 248(%rdi), %ymm2
7250; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7251; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
7252; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7253; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
7254; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
7255; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7256; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
7257; AVX2-FCP-NEXT:    # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7258; AVX2-FCP-NEXT:    vextractf128 $1, %ymm9, %xmm8
7259; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
7260; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
7261; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
7262; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
7263; AVX2-FCP-NEXT:    vbroadcastss 760(%rdi), %ymm2
7264; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7265; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
7266; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7267; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
7268; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
7269; AVX2-FCP-NEXT:    # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
7270; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7271; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
7272; AVX2-FCP-NEXT:    # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7273; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm15
7274; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
7275; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
7276; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
7277; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7278; AVX2-FCP-NEXT:    vbroadcastss 220(%rdi), %ymm1
7279; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7280; AVX2-FCP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7281; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
7282; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm4
7283; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
7284; AVX2-FCP-NEXT:    vextractf128 $1, %ymm9, %xmm9
7285; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
7286; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7]
7287; AVX2-FCP-NEXT:    vbroadcastss 476(%rdi), %ymm1
7288; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7289; AVX2-FCP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7290; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
7291; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm1
7292; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
7293; AVX2-FCP-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
7294; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
7295; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
7296; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7297; AVX2-FCP-NEXT:    vbroadcastss 732(%rdi), %ymm1
7298; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7299; AVX2-FCP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7300; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3]
7301; AVX2-FCP-NEXT:    vextractf128 $1, %ymm13, %xmm5
7302; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7]
7303; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
7304; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
7305; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
7306; AVX2-FCP-NEXT:    vbroadcastss 988(%rdi), %ymm1
7307; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7308; AVX2-FCP-NEXT:    # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7309; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
7310; AVX2-FCP-NEXT:    vextractf128 $1, %ymm10, %xmm3
7311; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7]
7312; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
7313; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
7314; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
7315; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7316; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rsi)
7317; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7318; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
7319; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7320; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rsi)
7321; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7322; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rsi)
7323; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7324; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rdx)
7325; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7326; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rdx)
7327; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7328; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rdx)
7329; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7330; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rdx)
7331; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7332; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rcx)
7333; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7334; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rcx)
7335; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7336; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rcx)
7337; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7338; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rcx)
7339; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7340; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%r8)
7341; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7342; AVX2-FCP-NEXT:    vmovaps %ymm3, (%r8)
7343; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7344; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%r8)
7345; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7346; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%r8)
7347; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7348; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%r9)
7349; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7350; AVX2-FCP-NEXT:    vmovaps %ymm3, (%r9)
7351; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7352; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%r9)
7353; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7354; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%r9)
7355; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7356; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7357; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rax)
7358; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7359; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rax)
7360; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
7361; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rax)
7362; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7363; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rax)
7364; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7365; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rax)
7366; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rax)
7367; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7368; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rax)
7369; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7370; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rax)
7371; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7372; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rax)
7373; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rax)
7374; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
7375; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rax)
7376; AVX2-FCP-NEXT:    addq $1544, %rsp # imm = 0x608
7377; AVX2-FCP-NEXT:    vzeroupper
7378; AVX2-FCP-NEXT:    retq
7379;
7380; AVX512-LABEL: load_i32_stride8_vf32:
7381; AVX512:       # %bb.0:
7382; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7383; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7384; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
7385; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
7386; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm29
7387; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm1
7388; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm30
7389; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm31
7390; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm3
7391; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm7
7392; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm6
7393; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm9
7394; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm5
7395; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm12
7396; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm2
7397; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm14
7398; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm11
7399; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm16
7400; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm15
7401; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
7402; AVX512-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7403; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm17
7404; AVX512-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
7405; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm18
7406; AVX512-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
7407; AVX512-NEXT:    movb $-64, %dil
7408; AVX512-NEXT:    kmovw %edi, %k1
7409; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
7410; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm10
7411; AVX512-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
7412; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm8
7413; AVX512-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
7414; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
7415; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
7416; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
7417; AVX512-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
7418; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
7419; AVX512-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
7420; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7421; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm8
7422; AVX512-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
7423; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
7424; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7425; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
7426; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
7427; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7428; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm10
7429; AVX512-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
7430; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm13
7431; AVX512-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
7432; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
7433; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm10
7434; AVX512-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
7435; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm4
7436; AVX512-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
7437; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7438; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
7439; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm4
7440; AVX512-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
7441; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
7442; AVX512-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
7443; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
7444; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4
7445; AVX512-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
7446; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
7447; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
7448; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
7449; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
7450; AVX512-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7451; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm8
7452; AVX512-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7453; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm10
7454; AVX512-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7455; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7456; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
7457; AVX512-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7458; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
7459; AVX512-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7460; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7461; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
7462; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
7463; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7464; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
7465; AVX512-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7466; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7467; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm8
7468; AVX512-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7469; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7470; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7471; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
7472; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
7473; AVX512-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7474; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm8
7475; AVX512-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7476; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm10
7477; AVX512-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7478; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7479; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
7480; AVX512-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7481; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
7482; AVX512-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7483; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7484; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
7485; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
7486; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7487; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
7488; AVX512-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7489; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7490; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm8
7491; AVX512-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7492; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7493; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7494; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
7495; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
7496; AVX512-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7497; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm8
7498; AVX512-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7499; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm10
7500; AVX512-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7501; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7502; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
7503; AVX512-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7504; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
7505; AVX512-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7506; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7507; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
7508; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
7509; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7510; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
7511; AVX512-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7512; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7513; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm8
7514; AVX512-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7515; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7516; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7517; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
7518; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
7519; AVX512-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7520; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm8
7521; AVX512-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7522; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm10
7523; AVX512-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7524; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7525; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
7526; AVX512-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7527; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
7528; AVX512-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7529; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7530; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
7531; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
7532; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7533; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
7534; AVX512-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7535; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7536; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm8
7537; AVX512-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7538; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7539; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7540; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
7541; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
7542; AVX512-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7543; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm8
7544; AVX512-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7545; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm10
7546; AVX512-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7547; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7548; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
7549; AVX512-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7550; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
7551; AVX512-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7552; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7553; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7554; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm10
7555; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
7556; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm13
7557; AVX512-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
7558; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
7559; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm10
7560; AVX512-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
7561; AVX512-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7562; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7563; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
7564; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
7565; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7566; AVX512-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
7567; AVX512-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
7568; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
7569; AVX512-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
7570; AVX512-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
7571; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
7572; AVX512-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
7573; AVX512-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
7574; AVX512-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
7575; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
7576; AVX512-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
7577; AVX512-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
7578; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7579; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7580; AVX512-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
7581; AVX512-NEXT:    vmovdqa64 %zmm17, (%rsi)
7582; AVX512-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
7583; AVX512-NEXT:    vmovdqa64 %zmm19, (%rdx)
7584; AVX512-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
7585; AVX512-NEXT:    vmovdqa64 %zmm21, (%rcx)
7586; AVX512-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7587; AVX512-NEXT:    vmovdqa64 %zmm23, (%r8)
7588; AVX512-NEXT:    vmovdqa64 %zmm24, 64(%r9)
7589; AVX512-NEXT:    vmovdqa64 %zmm25, (%r9)
7590; AVX512-NEXT:    vmovdqa64 %zmm26, 64(%r11)
7591; AVX512-NEXT:    vmovdqa64 %zmm27, (%r11)
7592; AVX512-NEXT:    vmovdqa64 %zmm8, 64(%r10)
7593; AVX512-NEXT:    vmovdqa64 %zmm4, (%r10)
7594; AVX512-NEXT:    vmovdqa64 %zmm2, 64(%rax)
7595; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
7596; AVX512-NEXT:    vzeroupper
7597; AVX512-NEXT:    retq
7598;
7599; AVX512-FCP-LABEL: load_i32_stride8_vf32:
7600; AVX512-FCP:       # %bb.0:
7601; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7602; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7603; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
7604; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
7605; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
7606; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
7607; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
7608; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm31
7609; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
7610; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
7611; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
7612; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
7613; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm5
7614; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm12
7615; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
7616; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm14
7617; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
7618; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm16
7619; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm15
7620; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
7621; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7622; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
7623; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
7624; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18
7625; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
7626; AVX512-FCP-NEXT:    movb $-64, %dil
7627; AVX512-FCP-NEXT:    kmovw %edi, %k1
7628; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
7629; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
7630; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
7631; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm8
7632; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
7633; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
7634; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
7635; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
7636; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
7637; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
7638; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
7639; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7640; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
7641; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
7642; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
7643; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7644; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
7645; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
7646; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7647; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10
7648; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
7649; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13
7650; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
7651; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
7652; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
7653; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
7654; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4
7655; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
7656; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7657; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
7658; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
7659; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
7660; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
7661; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
7662; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
7663; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
7664; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
7665; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
7666; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
7667; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
7668; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
7669; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7670; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
7671; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7672; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
7673; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7674; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7675; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
7676; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7677; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
7678; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7679; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7680; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
7681; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
7682; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7683; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
7684; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7685; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7686; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
7687; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7688; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7689; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7690; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
7691; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
7692; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7693; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
7694; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7695; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
7696; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7697; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7698; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
7699; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7700; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
7701; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7702; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7703; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
7704; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
7705; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7706; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
7707; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7708; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7709; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
7710; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7711; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7712; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7713; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
7714; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
7715; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7716; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
7717; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7718; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
7719; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7720; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7721; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
7722; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7723; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
7724; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7725; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7726; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
7727; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
7728; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7729; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
7730; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7731; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7732; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
7733; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7734; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7735; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7736; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
7737; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
7738; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7739; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
7740; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7741; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
7742; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7743; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7744; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
7745; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7746; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
7747; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7748; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7749; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
7750; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
7751; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7752; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
7753; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7754; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7755; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
7756; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7757; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7758; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7759; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
7760; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
7761; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7762; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
7763; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7764; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
7765; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7766; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7767; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
7768; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7769; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
7770; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7771; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7772; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7773; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
7774; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
7775; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
7776; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
7777; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
7778; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
7779; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
7780; AVX512-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7781; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7782; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
7783; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
7784; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7785; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
7786; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
7787; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
7788; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
7789; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
7790; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
7791; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
7792; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
7793; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
7794; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
7795; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
7796; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
7797; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7798; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7799; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
7800; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rsi)
7801; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
7802; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rdx)
7803; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
7804; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
7805; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7806; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
7807; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 64(%r9)
7808; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, (%r9)
7809; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r11)
7810; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, (%r11)
7811; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r10)
7812; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
7813; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
7814; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
7815; AVX512-FCP-NEXT:    vzeroupper
7816; AVX512-FCP-NEXT:    retq
7817;
7818; AVX512DQ-LABEL: load_i32_stride8_vf32:
7819; AVX512DQ:       # %bb.0:
7820; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7821; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7822; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r11
7823; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
7824; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm29
7825; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm1
7826; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm30
7827; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm31
7828; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm3
7829; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm7
7830; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm6
7831; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm9
7832; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm5
7833; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm12
7834; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm2
7835; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm14
7836; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm11
7837; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm16
7838; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm15
7839; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
7840; AVX512DQ-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7841; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm17
7842; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
7843; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm18
7844; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
7845; AVX512DQ-NEXT:    movb $-64, %dil
7846; AVX512DQ-NEXT:    kmovw %edi, %k1
7847; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
7848; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm10
7849; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
7850; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm8
7851; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
7852; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
7853; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
7854; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm8
7855; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
7856; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
7857; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
7858; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7859; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm8
7860; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
7861; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
7862; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7863; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
7864; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
7865; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7866; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm10
7867; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
7868; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm13
7869; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
7870; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
7871; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm10
7872; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
7873; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm4
7874; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
7875; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7876; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
7877; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm4
7878; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
7879; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
7880; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
7881; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
7882; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4
7883; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
7884; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
7885; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
7886; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
7887; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
7888; AVX512DQ-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7889; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm8
7890; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7891; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm10
7892; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7893; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7894; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
7895; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7896; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
7897; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7898; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7899; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
7900; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm8
7901; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7902; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
7903; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7904; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7905; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm8
7906; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7907; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7908; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7909; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
7910; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
7911; AVX512DQ-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7912; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm8
7913; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7914; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm10
7915; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7916; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7917; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
7918; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7919; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
7920; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7921; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7922; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
7923; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm8
7924; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7925; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
7926; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7927; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7928; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm8
7929; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7930; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7931; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7932; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
7933; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
7934; AVX512DQ-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7935; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm8
7936; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7937; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm10
7938; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7939; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7940; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
7941; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7942; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
7943; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7944; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7945; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
7946; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm8
7947; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7948; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
7949; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7950; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7951; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm8
7952; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7953; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7954; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7955; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
7956; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
7957; AVX512DQ-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7958; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm8
7959; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7960; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm10
7961; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7962; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7963; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
7964; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7965; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
7966; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7967; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7968; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
7969; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm8
7970; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
7971; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
7972; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
7973; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7974; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm8
7975; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
7976; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
7977; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7978; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
7979; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
7980; AVX512DQ-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7981; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm8
7982; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
7983; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm10
7984; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
7985; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
7986; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
7987; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
7988; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
7989; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
7990; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7991; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7992; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm10
7993; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
7994; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm13
7995; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
7996; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
7997; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm10
7998; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
7999; AVX512DQ-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8000; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8001; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8002; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8003; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8004; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
8005; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
8006; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
8007; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
8008; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
8009; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8010; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8011; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
8012; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
8013; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
8014; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
8015; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
8016; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8017; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8018; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
8019; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rsi)
8020; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
8021; AVX512DQ-NEXT:    vmovdqa64 %zmm19, (%rdx)
8022; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
8023; AVX512DQ-NEXT:    vmovdqa64 %zmm21, (%rcx)
8024; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8025; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%r8)
8026; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 64(%r9)
8027; AVX512DQ-NEXT:    vmovdqa64 %zmm25, (%r9)
8028; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 64(%r11)
8029; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%r11)
8030; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 64(%r10)
8031; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%r10)
8032; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rax)
8033; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
8034; AVX512DQ-NEXT:    vzeroupper
8035; AVX512DQ-NEXT:    retq
8036;
8037; AVX512DQ-FCP-LABEL: load_i32_stride8_vf32:
8038; AVX512DQ-FCP:       # %bb.0:
8039; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8040; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8041; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
8042; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
8043; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
8044; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
8045; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
8046; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm31
8047; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
8048; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
8049; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
8050; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
8051; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm5
8052; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm12
8053; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
8054; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm14
8055; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
8056; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm16
8057; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm15
8058; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8059; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8060; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
8061; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
8062; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18
8063; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
8064; AVX512DQ-FCP-NEXT:    movb $-64, %dil
8065; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
8066; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
8067; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
8068; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
8069; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm8
8070; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
8071; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8072; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8073; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8074; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
8075; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8076; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
8077; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8078; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8079; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
8080; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
8081; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8082; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8083; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8084; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8085; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10
8086; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
8087; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13
8088; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
8089; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8090; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
8091; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
8092; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4
8093; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
8094; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8095; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8096; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
8097; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
8098; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8099; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
8100; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
8101; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
8102; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
8103; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
8104; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8105; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8106; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8107; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8108; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8109; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8110; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8111; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8112; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8113; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8114; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8115; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8116; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8117; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8118; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8119; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8120; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8121; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8122; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8123; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8124; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8125; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8126; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8127; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8128; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8129; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8130; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8131; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8132; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8133; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8134; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8135; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8136; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8137; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8138; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8139; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8140; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8141; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8142; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8143; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8144; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8145; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8146; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8147; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8148; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8149; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8150; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8151; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8152; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8153; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8154; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8155; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8156; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8157; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8158; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8159; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8160; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8161; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8162; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8163; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8164; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8165; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8166; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8167; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8168; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8169; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8170; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8171; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8172; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8173; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8174; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8175; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8176; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8177; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8178; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8179; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8180; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8181; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8182; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8183; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8184; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8185; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8186; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8187; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8188; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8189; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8190; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8191; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8192; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8193; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8194; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8195; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8196; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8197; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8198; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8199; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8200; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8201; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8202; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8203; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8204; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8205; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8206; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8207; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8208; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8209; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8210; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8211; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
8212; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
8213; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
8214; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
8215; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8216; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
8217; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
8218; AVX512DQ-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8219; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8220; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8221; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8222; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8223; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
8224; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
8225; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
8226; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
8227; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
8228; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8229; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8230; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
8231; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
8232; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
8233; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
8234; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
8235; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8236; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8237; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
8238; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rsi)
8239; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
8240; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rdx)
8241; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
8242; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
8243; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8244; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
8245; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 64(%r9)
8246; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, (%r9)
8247; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r11)
8248; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, (%r11)
8249; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r10)
8250; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
8251; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
8252; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
8253; AVX512DQ-FCP-NEXT:    vzeroupper
8254; AVX512DQ-FCP-NEXT:    retq
8255;
8256; AVX512BW-LABEL: load_i32_stride8_vf32:
8257; AVX512BW:       # %bb.0:
8258; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8259; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8260; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
8261; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
8262; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm29
8263; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
8264; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm30
8265; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm31
8266; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
8267; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
8268; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
8269; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
8270; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
8271; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm12
8272; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
8273; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm14
8274; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
8275; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm16
8276; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm15
8277; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8278; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8279; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17
8280; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
8281; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm18
8282; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
8283; AVX512BW-NEXT:    movb $-64, %dil
8284; AVX512BW-NEXT:    kmovd %edi, %k1
8285; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
8286; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
8287; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
8288; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
8289; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
8290; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8291; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8292; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8293; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
8294; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8295; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
8296; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8297; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8298; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
8299; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
8300; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8301; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8302; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8303; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8304; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm10
8305; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
8306; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
8307; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
8308; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8309; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
8310; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
8311; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
8312; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
8313; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8314; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8315; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
8316; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
8317; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8318; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
8319; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
8320; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
8321; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
8322; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
8323; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8324; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8325; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8326; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8327; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8328; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8329; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8330; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8331; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8332; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8333; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8334; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8335; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8336; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8337; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8338; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8339; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8340; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8341; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8342; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8343; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8344; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8345; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8346; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8347; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8348; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8349; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8350; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8351; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8352; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8353; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8354; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8355; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8356; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8357; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8358; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8359; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8360; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8361; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8362; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8363; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8364; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8365; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8366; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8367; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8368; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8369; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8370; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8371; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8372; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8373; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8374; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8375; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8376; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8377; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8378; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8379; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8380; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8381; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8382; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8383; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8384; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8385; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8386; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8387; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8388; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8389; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8390; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8391; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8392; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8393; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8394; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8395; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8396; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8397; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8398; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8399; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8400; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8401; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8402; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8403; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8404; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8405; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8406; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8407; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8408; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8409; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8410; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8411; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8412; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8413; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8414; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8415; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8416; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8417; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8418; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8419; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8420; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8421; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8422; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8423; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8424; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8425; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8426; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8427; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8428; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8429; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8430; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
8431; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
8432; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
8433; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
8434; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8435; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
8436; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
8437; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8438; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8439; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8440; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8441; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8442; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
8443; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
8444; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
8445; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
8446; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
8447; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8448; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8449; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
8450; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
8451; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
8452; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
8453; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
8454; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8455; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8456; AVX512BW-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
8457; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rsi)
8458; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
8459; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rdx)
8460; AVX512BW-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
8461; AVX512BW-NEXT:    vmovdqa64 %zmm21, (%rcx)
8462; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8463; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r8)
8464; AVX512BW-NEXT:    vmovdqa64 %zmm24, 64(%r9)
8465; AVX512BW-NEXT:    vmovdqa64 %zmm25, (%r9)
8466; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%r11)
8467; AVX512BW-NEXT:    vmovdqa64 %zmm27, (%r11)
8468; AVX512BW-NEXT:    vmovdqa64 %zmm8, 64(%r10)
8469; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r10)
8470; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
8471; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
8472; AVX512BW-NEXT:    vzeroupper
8473; AVX512BW-NEXT:    retq
8474;
8475; AVX512BW-FCP-LABEL: load_i32_stride8_vf32:
8476; AVX512BW-FCP:       # %bb.0:
8477; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8478; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8479; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
8480; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
8481; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
8482; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
8483; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
8484; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm31
8485; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
8486; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
8487; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
8488; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
8489; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm5
8490; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm12
8491; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
8492; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm14
8493; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
8494; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm16
8495; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm15
8496; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8497; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8498; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
8499; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
8500; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18
8501; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
8502; AVX512BW-FCP-NEXT:    movb $-64, %dil
8503; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
8504; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
8505; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
8506; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
8507; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm8
8508; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
8509; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8510; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8511; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8512; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
8513; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8514; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
8515; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8516; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8517; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
8518; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
8519; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8520; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8521; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8522; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8523; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10
8524; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
8525; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13
8526; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
8527; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8528; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
8529; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
8530; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4
8531; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
8532; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8533; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8534; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
8535; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
8536; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8537; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
8538; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
8539; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
8540; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
8541; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
8542; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8543; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8544; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8545; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8546; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8547; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8548; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8549; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8550; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8551; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8552; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8553; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8554; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8555; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8556; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8557; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8558; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8559; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8560; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8561; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8562; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8563; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8564; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8565; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8566; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8567; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8568; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8569; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8570; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8571; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8572; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8573; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8574; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8575; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8576; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8577; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8578; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8579; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8580; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8581; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8582; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8583; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8584; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8585; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8586; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8587; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8588; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8589; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8590; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8591; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8592; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8593; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8594; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8595; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8596; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8597; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8598; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8599; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8600; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8601; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8602; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8603; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8604; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8605; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8606; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8607; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8608; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8609; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8610; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8611; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8612; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8613; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8614; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8615; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8616; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8617; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8618; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8619; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8620; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8621; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8622; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8623; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8624; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8625; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8626; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8627; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8628; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8629; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8630; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8631; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8632; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8633; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8634; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8635; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8636; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8637; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8638; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8639; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8640; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8641; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8642; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8643; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8644; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8645; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8646; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8647; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8648; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8649; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
8650; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
8651; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
8652; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
8653; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8654; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
8655; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
8656; AVX512BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8657; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8658; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8659; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8660; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8661; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
8662; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
8663; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
8664; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
8665; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
8666; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8667; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8668; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
8669; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
8670; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
8671; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
8672; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
8673; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8674; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8675; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
8676; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rsi)
8677; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
8678; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, (%rdx)
8679; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
8680; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
8681; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8682; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
8683; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, 64(%r9)
8684; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, (%r9)
8685; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r11)
8686; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, (%r11)
8687; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r10)
8688; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
8689; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
8690; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
8691; AVX512BW-FCP-NEXT:    vzeroupper
8692; AVX512BW-FCP-NEXT:    retq
8693;
8694; AVX512DQ-BW-LABEL: load_i32_stride8_vf32:
8695; AVX512DQ-BW:       # %bb.0:
8696; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8697; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8698; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
8699; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
8700; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm29
8701; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
8702; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm30
8703; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm31
8704; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
8705; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
8706; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
8707; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
8708; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
8709; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm12
8710; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
8711; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm14
8712; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
8713; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm16
8714; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm15
8715; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8716; AVX512DQ-BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8717; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm17
8718; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
8719; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm18
8720; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
8721; AVX512DQ-BW-NEXT:    movb $-64, %dil
8722; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
8723; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
8724; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm10
8725; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
8726; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm8
8727; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
8728; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8729; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8730; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8731; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
8732; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8733; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
8734; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8735; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8736; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
8737; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
8738; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8739; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8740; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8741; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8742; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm10
8743; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
8744; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm13
8745; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
8746; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8747; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm10
8748; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
8749; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm4
8750; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
8751; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8752; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8753; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm4
8754; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
8755; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8756; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
8757; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
8758; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4
8759; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
8760; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
8761; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8762; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8763; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8764; AVX512DQ-BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8765; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8766; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8767; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8768; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8769; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8770; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8771; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8772; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8773; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8774; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8775; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8776; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8777; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8778; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8779; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8780; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8781; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8782; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8783; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8784; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8785; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8786; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8787; AVX512DQ-BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8788; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8789; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8790; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8791; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8792; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8793; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8794; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8795; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8796; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8797; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8798; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8799; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8800; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8801; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8802; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8803; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8804; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8805; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8806; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8807; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8808; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8809; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8810; AVX512DQ-BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8811; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8812; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8813; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8814; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8815; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8816; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8817; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8818; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8819; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8820; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8821; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8822; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8823; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8824; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8825; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8826; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8827; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8828; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8829; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8830; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8831; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8832; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8833; AVX512DQ-BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8834; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8835; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8836; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8837; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8838; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8839; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8840; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8841; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8842; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8843; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8844; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8845; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm8
8846; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8847; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
8848; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8849; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8850; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm8
8851; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
8852; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8853; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8854; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8855; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8856; AVX512DQ-BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8857; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm8
8858; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8859; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm10
8860; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8861; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8862; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
8863; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8864; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
8865; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8866; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8867; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8868; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm10
8869; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
8870; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm13
8871; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
8872; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8873; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm10
8874; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
8875; AVX512DQ-BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
8876; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8877; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8878; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8879; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8880; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
8881; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
8882; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
8883; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
8884; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
8885; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8886; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8887; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
8888; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
8889; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
8890; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
8891; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
8892; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8893; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8894; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
8895; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, (%rsi)
8896; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
8897; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, (%rdx)
8898; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
8899; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, (%rcx)
8900; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8901; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, (%r8)
8902; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, 64(%r9)
8903; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, (%r9)
8904; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, 64(%r11)
8905; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, (%r11)
8906; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 64(%r10)
8907; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%r10)
8908; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
8909; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
8910; AVX512DQ-BW-NEXT:    vzeroupper
8911; AVX512DQ-BW-NEXT:    retq
8912;
8913; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf32:
8914; AVX512DQ-BW-FCP:       # %bb.0:
8915; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8916; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8917; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
8918; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
8919; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
8920; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
8921; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
8922; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm31
8923; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm3
8924; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
8925; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
8926; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
8927; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm5
8928; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm12
8929; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
8930; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm14
8931; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
8932; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm16
8933; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm15
8934; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8935; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8936; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
8937; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
8938; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18
8939; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
8940; AVX512DQ-BW-FCP-NEXT:    movb $-64, %dil
8941; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
8942; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
8943; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
8944; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
8945; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm8
8946; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
8947; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8948; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8949; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8950; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
8951; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8952; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
8953; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8954; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
8955; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
8956; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
8957; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8958; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8959; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8960; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8961; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10
8962; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
8963; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13
8964; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
8965; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
8966; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
8967; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
8968; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4
8969; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
8970; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8971; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8972; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
8973; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
8974; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8975; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
8976; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
8977; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
8978; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
8979; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
8980; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8981; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8982; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8983; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8984; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
8985; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
8986; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
8987; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
8988; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
8989; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
8990; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
8991; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
8992; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
8993; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8994; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8995; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
8996; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
8997; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
8998; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
8999; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9000; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
9001; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
9002; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
9003; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9004; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
9005; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
9006; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9007; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
9008; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
9009; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
9010; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
9011; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9012; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
9013; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
9014; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
9015; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
9016; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9017; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
9018; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
9019; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
9020; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
9021; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
9022; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9023; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
9024; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
9025; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
9026; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9027; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
9028; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
9029; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9030; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
9031; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
9032; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
9033; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
9034; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9035; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
9036; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
9037; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
9038; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
9039; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9040; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
9041; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
9042; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
9043; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
9044; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
9045; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9046; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
9047; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
9048; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
9049; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9050; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
9051; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
9052; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9053; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
9054; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
9055; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
9056; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
9057; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9058; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
9059; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
9060; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
9061; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
9062; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9063; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
9064; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
9065; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
9066; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
9067; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
9068; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9069; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
9070; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
9071; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
9072; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9073; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
9074; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
9075; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9076; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm8
9077; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
9078; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10
9079; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
9080; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
9081; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
9082; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
9083; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
9084; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
9085; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9086; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
9087; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
9088; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
9089; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
9090; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
9091; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
9092; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
9093; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
9094; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
9095; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
9096; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
9097; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
9098; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9099; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
9100; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
9101; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
9102; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
9103; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
9104; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
9105; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
9106; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
9107; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
9108; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
9109; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
9110; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
9111; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9112; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
9113; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
9114; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rsi)
9115; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
9116; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, (%rdx)
9117; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
9118; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
9119; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
9120; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
9121; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, 64(%r9)
9122; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, (%r9)
9123; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r11)
9124; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, (%r11)
9125; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r10)
9126; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
9127; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
9128; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
9129; AVX512DQ-BW-FCP-NEXT:    vzeroupper
9130; AVX512DQ-BW-FCP-NEXT:    retq
9131  %wide.vec = load <256 x i32>, ptr %in.vec, align 64
9132  %strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
9133  %strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
9134  %strided.vec2 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250>
9135  %strided.vec3 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251>
9136  %strided.vec4 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252>
9137  %strided.vec5 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253>
9138  %strided.vec6 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254>
9139  %strided.vec7 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255>
9140  store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
9141  store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
9142  store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
9143  store <32 x i32> %strided.vec3, ptr %out.vec3, align 64
9144  store <32 x i32> %strided.vec4, ptr %out.vec4, align 64
9145  store <32 x i32> %strided.vec5, ptr %out.vec5, align 64
9146  store <32 x i32> %strided.vec6, ptr %out.vec6, align 64
9147  store <32 x i32> %strided.vec7, ptr %out.vec7, align 64
9148  ret void
9149}
9150
9151define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
9152; SSE-LABEL: load_i32_stride8_vf64:
9153; SSE:       # %bb.0:
9154; SSE-NEXT:    subq $2232, %rsp # imm = 0x8B8
9155; SSE-NEXT:    movaps 288(%rdi), %xmm4
9156; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9157; SSE-NEXT:    movaps 352(%rdi), %xmm5
9158; SSE-NEXT:    movaps %xmm5, (%rsp) # 16-byte Spill
9159; SSE-NEXT:    movaps 320(%rdi), %xmm6
9160; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9161; SSE-NEXT:    movaps 416(%rdi), %xmm7
9162; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9163; SSE-NEXT:    movaps 384(%rdi), %xmm8
9164; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9165; SSE-NEXT:    movaps 480(%rdi), %xmm9
9166; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9167; SSE-NEXT:    movaps 448(%rdi), %xmm3
9168; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9169; SSE-NEXT:    movaps 160(%rdi), %xmm10
9170; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9171; SSE-NEXT:    movaps 128(%rdi), %xmm1
9172; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9173; SSE-NEXT:    movaps 224(%rdi), %xmm2
9174; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9175; SSE-NEXT:    movaps 192(%rdi), %xmm0
9176; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9177; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9178; SSE-NEXT:    movaps %xmm1, %xmm2
9179; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
9180; SSE-NEXT:    movaps %xmm2, %xmm1
9181; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
9182; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9183; SSE-NEXT:    movaps %xmm3, %xmm1
9184; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
9185; SSE-NEXT:    movaps %xmm8, %xmm3
9186; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
9187; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
9188; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9189; SSE-NEXT:    movaps %xmm3, %xmm0
9190; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9191; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9192; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
9193; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9194; SSE-NEXT:    movaps %xmm6, %xmm0
9195; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
9196; SSE-NEXT:    movaps 256(%rdi), %xmm2
9197; SSE-NEXT:    movaps %xmm2, %xmm1
9198; SSE-NEXT:    movaps %xmm2, %xmm3
9199; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
9200; SSE-NEXT:    movaps %xmm1, %xmm2
9201; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9202; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9203; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9204; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9205; SSE-NEXT:    movaps 736(%rdi), %xmm9
9206; SSE-NEXT:    movaps 704(%rdi), %xmm0
9207; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9208; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9209; SSE-NEXT:    movaps 672(%rdi), %xmm2
9210; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9211; SSE-NEXT:    movaps 640(%rdi), %xmm1
9212; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9213; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9214; SSE-NEXT:    movaps %xmm1, %xmm2
9215; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9216; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9217; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9218; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9219; SSE-NEXT:    movaps 608(%rdi), %xmm2
9220; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9221; SSE-NEXT:    movaps 576(%rdi), %xmm1
9222; SSE-NEXT:    movaps %xmm1, %xmm0
9223; SSE-NEXT:    movaps %xmm1, %xmm4
9224; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9225; SSE-NEXT:    movaps 544(%rdi), %xmm15
9226; SSE-NEXT:    movaps 512(%rdi), %xmm2
9227; SSE-NEXT:    movaps %xmm2, %xmm1
9228; SSE-NEXT:    movaps %xmm2, %xmm6
9229; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
9230; SSE-NEXT:    movaps %xmm1, %xmm2
9231; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9232; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9233; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9234; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9235; SSE-NEXT:    movaps 992(%rdi), %xmm1
9236; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9237; SSE-NEXT:    movaps 960(%rdi), %xmm0
9238; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9239; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9240; SSE-NEXT:    movaps 928(%rdi), %xmm2
9241; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9242; SSE-NEXT:    movaps 896(%rdi), %xmm1
9243; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9244; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9245; SSE-NEXT:    movaps %xmm1, %xmm2
9246; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9247; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9248; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9249; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9250; SSE-NEXT:    movaps 864(%rdi), %xmm1
9251; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9252; SSE-NEXT:    movaps 832(%rdi), %xmm0
9253; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9254; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9255; SSE-NEXT:    movaps 800(%rdi), %xmm14
9256; SSE-NEXT:    movaps 768(%rdi), %xmm2
9257; SSE-NEXT:    movaps %xmm2, %xmm1
9258; SSE-NEXT:    movaps %xmm2, %xmm8
9259; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
9260; SSE-NEXT:    movaps %xmm1, %xmm2
9261; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9262; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9263; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9264; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9265; SSE-NEXT:    movaps 1248(%rdi), %xmm1
9266; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9267; SSE-NEXT:    movaps 1216(%rdi), %xmm0
9268; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9269; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9270; SSE-NEXT:    movaps 1184(%rdi), %xmm2
9271; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9272; SSE-NEXT:    movaps 1152(%rdi), %xmm1
9273; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9274; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9275; SSE-NEXT:    movaps %xmm1, %xmm2
9276; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9277; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9278; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9279; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9280; SSE-NEXT:    movaps 1120(%rdi), %xmm2
9281; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9282; SSE-NEXT:    movaps 1088(%rdi), %xmm1
9283; SSE-NEXT:    movaps %xmm1, %xmm0
9284; SSE-NEXT:    movaps %xmm1, %xmm7
9285; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9286; SSE-NEXT:    movaps 1056(%rdi), %xmm2
9287; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9288; SSE-NEXT:    movaps 1024(%rdi), %xmm1
9289; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9290; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9291; SSE-NEXT:    movaps %xmm1, %xmm2
9292; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9293; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9294; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9295; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9296; SSE-NEXT:    movaps 1504(%rdi), %xmm1
9297; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9298; SSE-NEXT:    movaps 1472(%rdi), %xmm0
9299; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9300; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9301; SSE-NEXT:    movaps 1440(%rdi), %xmm2
9302; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9303; SSE-NEXT:    movaps 1408(%rdi), %xmm1
9304; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9305; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9306; SSE-NEXT:    movaps %xmm1, %xmm2
9307; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9308; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9309; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9310; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9311; SSE-NEXT:    movaps 1376(%rdi), %xmm1
9312; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9313; SSE-NEXT:    movaps 1344(%rdi), %xmm0
9314; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9315; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9316; SSE-NEXT:    movaps 1312(%rdi), %xmm2
9317; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9318; SSE-NEXT:    movaps 1280(%rdi), %xmm1
9319; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9320; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9321; SSE-NEXT:    movaps %xmm1, %xmm2
9322; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9323; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9324; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9325; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9326; SSE-NEXT:    movaps 1760(%rdi), %xmm1
9327; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9328; SSE-NEXT:    movaps 1728(%rdi), %xmm0
9329; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9330; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9331; SSE-NEXT:    movaps 1696(%rdi), %xmm2
9332; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9333; SSE-NEXT:    movaps 1664(%rdi), %xmm1
9334; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9335; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9336; SSE-NEXT:    movaps %xmm1, %xmm2
9337; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9338; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9339; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9340; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9341; SSE-NEXT:    movaps 1632(%rdi), %xmm1
9342; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9343; SSE-NEXT:    movaps 1600(%rdi), %xmm0
9344; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9345; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9346; SSE-NEXT:    movaps 1568(%rdi), %xmm5
9347; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9348; SSE-NEXT:    movaps 1536(%rdi), %xmm2
9349; SSE-NEXT:    movaps %xmm2, %xmm1
9350; SSE-NEXT:    movaps %xmm2, %xmm13
9351; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
9352; SSE-NEXT:    movaps %xmm1, %xmm2
9353; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9354; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9355; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9356; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9357; SSE-NEXT:    movaps 2016(%rdi), %xmm1
9358; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9359; SSE-NEXT:    movaps 1984(%rdi), %xmm0
9360; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9361; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9362; SSE-NEXT:    movaps 1952(%rdi), %xmm2
9363; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9364; SSE-NEXT:    movaps 1920(%rdi), %xmm1
9365; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9366; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9367; SSE-NEXT:    movaps %xmm1, %xmm2
9368; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9369; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9370; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9371; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9372; SSE-NEXT:    movaps 1888(%rdi), %xmm1
9373; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9374; SSE-NEXT:    movaps 1856(%rdi), %xmm0
9375; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9376; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9377; SSE-NEXT:    movaps 1824(%rdi), %xmm2
9378; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9379; SSE-NEXT:    movaps 1792(%rdi), %xmm1
9380; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9381; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9382; SSE-NEXT:    movaps %xmm1, %xmm5
9383; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
9384; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9385; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9386; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9387; SSE-NEXT:    movaps 96(%rdi), %xmm2
9388; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9389; SSE-NEXT:    movaps 64(%rdi), %xmm12
9390; SSE-NEXT:    movaps %xmm12, %xmm0
9391; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9392; SSE-NEXT:    movaps (%rdi), %xmm10
9393; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9394; SSE-NEXT:    movaps 32(%rdi), %xmm1
9395; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9396; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
9397; SSE-NEXT:    movaps %xmm10, %xmm5
9398; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
9399; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
9401; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9402; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9403; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9404; SSE-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
9405; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9406; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9407; SSE-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
9408; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9409; SSE-NEXT:    unpckhps (%rsp), %xmm10 # 16-byte Folded Reload
9410; SSE-NEXT:    # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
9411; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9412; SSE-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9413; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9414; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9415; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9416; SSE-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9417; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9418; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9419; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9420; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9421; SSE-NEXT:    # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9422; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9423; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
9424; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9425; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9426; SSE-NEXT:    unpckhps {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3]
9427; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9428; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9429; SSE-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9430; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9431; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9432; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9433; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9434; SSE-NEXT:    unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3]
9435; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9436; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9437; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9438; SSE-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
9439; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9440; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9441; SSE-NEXT:    # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
9442; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9443; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9444; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9445; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9446; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9447; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9448; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9449; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9450; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9451; SSE-NEXT:    # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
9452; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9453; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9454; SSE-NEXT:    # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
9455; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9456; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9457; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9458; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9459; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9460; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9461; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9462; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9463; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9464; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9465; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9466; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9467; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9468; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9469; SSE-NEXT:    # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9470; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9471; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9472; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9473; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9474; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9475; SSE-NEXT:    # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
9476; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9477; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9478; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9479; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9480; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9481; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9482; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9483; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9484; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9485; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9486; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9487; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9488; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9489; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9490; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9491; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9492; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9493; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9494; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9495; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9496; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9497; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9498; SSE-NEXT:    # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
9499; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9500; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9501; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9502; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9503; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9504; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9505; SSE-NEXT:    movaps %xmm5, %xmm7
9506; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0]
9507; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9508; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1]
9509; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9510; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9511; SSE-NEXT:    movaps %xmm5, %xmm7
9512; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
9513; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9514; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
9515; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9516; SSE-NEXT:    movaps %xmm0, %xmm5
9517; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
9518; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9519; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
9520; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9521; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9522; SSE-NEXT:    movaps %xmm0, %xmm5
9523; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9524; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
9525; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9526; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
9527; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9528; SSE-NEXT:    movaps %xmm1, %xmm5
9529; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0]
9530; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9531; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
9532; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9533; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9534; SSE-NEXT:    movaps %xmm0, %xmm5
9535; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9536; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9537; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9538; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9539; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9540; SSE-NEXT:    movaps %xmm9, %xmm5
9541; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0]
9542; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9543; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1]
9544; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9545; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9546; SSE-NEXT:    movaps %xmm0, %xmm5
9547; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9548; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9549; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9550; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9551; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9552; SSE-NEXT:    movaps %xmm6, %xmm5
9553; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0]
9554; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9555; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
9556; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9557; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9558; SSE-NEXT:    movaps %xmm0, %xmm5
9559; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9560; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9561; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9562; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9563; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9564; SSE-NEXT:    movaps %xmm4, %xmm5
9565; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9566; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9567; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9568; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
9569; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9570; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9571; SSE-NEXT:    movaps %xmm0, %xmm5
9572; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9573; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9574; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9575; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9576; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9577; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9578; SSE-NEXT:    movaps %xmm0, %xmm4
9579; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
9580; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9581; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
9582; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9583; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9584; SSE-NEXT:    movaps %xmm6, %xmm0
9585; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9586; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9587; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9588; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
9589; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9590; SSE-NEXT:    movaps %xmm13, %xmm0
9591; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9592; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9593; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9594; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1]
9595; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9596; SSE-NEXT:    movaps %xmm12, %xmm0
9597; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9598; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9599; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9600; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
9601; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9602; SSE-NEXT:    movaps 240(%rdi), %xmm2
9603; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9604; SSE-NEXT:    movaps 208(%rdi), %xmm7
9605; SSE-NEXT:    movaps %xmm7, %xmm0
9606; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9607; SSE-NEXT:    movaps 176(%rdi), %xmm3
9608; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9609; SSE-NEXT:    movaps 144(%rdi), %xmm2
9610; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9611; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
9612; SSE-NEXT:    movaps %xmm2, %xmm1
9613; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
9614; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9615; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
9616; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9617; SSE-NEXT:    movaps 368(%rdi), %xmm3
9618; SSE-NEXT:    movaps 336(%rdi), %xmm0
9619; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
9620; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
9621; SSE-NEXT:    movaps 304(%rdi), %xmm4
9622; SSE-NEXT:    movaps 272(%rdi), %xmm1
9623; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9624; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
9625; SSE-NEXT:    movaps %xmm1, %xmm2
9626; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9627; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9628; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9629; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9630; SSE-NEXT:    movaps 496(%rdi), %xmm5
9631; SSE-NEXT:    movaps 464(%rdi), %xmm0
9632; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9633; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
9634; SSE-NEXT:    movaps 432(%rdi), %xmm6
9635; SSE-NEXT:    movaps 400(%rdi), %xmm1
9636; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9637; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
9638; SSE-NEXT:    movaps %xmm1, %xmm2
9639; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9640; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9641; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9642; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9643; SSE-NEXT:    movaps 624(%rdi), %xmm9
9644; SSE-NEXT:    movaps 592(%rdi), %xmm0
9645; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9646; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9647; SSE-NEXT:    movaps 560(%rdi), %xmm10
9648; SSE-NEXT:    movaps 528(%rdi), %xmm1
9649; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9650; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
9651; SSE-NEXT:    movaps %xmm1, %xmm2
9652; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9653; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9654; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9655; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9656; SSE-NEXT:    movaps 752(%rdi), %xmm12
9657; SSE-NEXT:    movaps 720(%rdi), %xmm0
9658; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9659; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
9660; SSE-NEXT:    movaps 688(%rdi), %xmm13
9661; SSE-NEXT:    movaps 656(%rdi), %xmm2
9662; SSE-NEXT:    movaps %xmm2, %xmm1
9663; SSE-NEXT:    movaps %xmm2, %xmm15
9664; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
9665; SSE-NEXT:    movaps %xmm1, %xmm2
9666; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9667; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9668; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9669; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9670; SSE-NEXT:    movaps 880(%rdi), %xmm1
9671; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9672; SSE-NEXT:    movaps 848(%rdi), %xmm0
9673; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9674; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9675; SSE-NEXT:    movaps 816(%rdi), %xmm2
9676; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9677; SSE-NEXT:    movaps 784(%rdi), %xmm1
9678; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9679; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9680; SSE-NEXT:    movaps %xmm1, %xmm2
9681; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9682; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9683; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9684; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9685; SSE-NEXT:    movaps 1008(%rdi), %xmm14
9686; SSE-NEXT:    movaps 976(%rdi), %xmm0
9687; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9688; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
9689; SSE-NEXT:    movaps 944(%rdi), %xmm2
9690; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9691; SSE-NEXT:    movaps 912(%rdi), %xmm1
9692; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9693; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9694; SSE-NEXT:    movaps %xmm1, %xmm2
9695; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9696; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9697; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9698; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9699; SSE-NEXT:    movaps 1136(%rdi), %xmm1
9700; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9701; SSE-NEXT:    movaps 1104(%rdi), %xmm0
9702; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9703; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9704; SSE-NEXT:    movaps 1072(%rdi), %xmm2
9705; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9706; SSE-NEXT:    movaps 1040(%rdi), %xmm1
9707; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9708; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9709; SSE-NEXT:    movaps %xmm1, %xmm2
9710; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9711; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9712; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9713; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9714; SSE-NEXT:    movaps 1264(%rdi), %xmm11
9715; SSE-NEXT:    movaps 1232(%rdi), %xmm0
9716; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9717; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
9718; SSE-NEXT:    movaps 1200(%rdi), %xmm2
9719; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9720; SSE-NEXT:    movaps 1168(%rdi), %xmm1
9721; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9722; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9723; SSE-NEXT:    movaps %xmm1, %xmm2
9724; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9725; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9726; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9727; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9728; SSE-NEXT:    movaps 1392(%rdi), %xmm1
9729; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9730; SSE-NEXT:    movaps 1360(%rdi), %xmm0
9731; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9732; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9733; SSE-NEXT:    movaps 1328(%rdi), %xmm2
9734; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9735; SSE-NEXT:    movaps 1296(%rdi), %xmm1
9736; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9737; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9738; SSE-NEXT:    movaps %xmm1, %xmm2
9739; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9740; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9741; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9742; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9743; SSE-NEXT:    movaps 1520(%rdi), %xmm1
9744; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9745; SSE-NEXT:    movaps 1488(%rdi), %xmm0
9746; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9747; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9748; SSE-NEXT:    movaps 1456(%rdi), %xmm2
9749; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9750; SSE-NEXT:    movaps 1424(%rdi), %xmm1
9751; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9752; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9753; SSE-NEXT:    movaps %xmm1, %xmm2
9754; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9755; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9756; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9757; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9758; SSE-NEXT:    movaps 1648(%rdi), %xmm1
9759; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9760; SSE-NEXT:    movaps 1616(%rdi), %xmm0
9761; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9762; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9763; SSE-NEXT:    movaps 1584(%rdi), %xmm2
9764; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9765; SSE-NEXT:    movaps 1552(%rdi), %xmm1
9766; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9767; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9768; SSE-NEXT:    movaps %xmm1, %xmm2
9769; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9770; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9771; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9772; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9773; SSE-NEXT:    movaps 1776(%rdi), %xmm8
9774; SSE-NEXT:    movaps 1744(%rdi), %xmm0
9775; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9776; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
9777; SSE-NEXT:    movaps 1712(%rdi), %xmm2
9778; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9779; SSE-NEXT:    movaps 1680(%rdi), %xmm1
9780; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9781; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9782; SSE-NEXT:    movaps %xmm1, %xmm2
9783; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9784; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9785; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9786; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9787; SSE-NEXT:    movaps 1904(%rdi), %xmm1
9788; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9789; SSE-NEXT:    movaps 1872(%rdi), %xmm0
9790; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9791; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9792; SSE-NEXT:    movaps 1840(%rdi), %xmm2
9793; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9794; SSE-NEXT:    movaps 1808(%rdi), %xmm1
9795; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9796; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9797; SSE-NEXT:    movaps %xmm1, %xmm2
9798; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9799; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9800; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9801; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9802; SSE-NEXT:    movaps 2032(%rdi), %xmm1
9803; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9804; SSE-NEXT:    movaps 2000(%rdi), %xmm0
9805; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9806; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9807; SSE-NEXT:    movaps 1968(%rdi), %xmm2
9808; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9809; SSE-NEXT:    movaps 1936(%rdi), %xmm1
9810; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9811; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9812; SSE-NEXT:    movaps %xmm1, %xmm2
9813; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9814; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9815; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9816; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9817; SSE-NEXT:    movaps 112(%rdi), %xmm1
9818; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9819; SSE-NEXT:    movaps 80(%rdi), %xmm0
9820; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9821; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9822; SSE-NEXT:    movaps 16(%rdi), %xmm1
9823; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9824; SSE-NEXT:    movaps 48(%rdi), %xmm2
9825; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9826; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9827; SSE-NEXT:    movaps %xmm1, %xmm2
9828; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9829; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9830; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9831; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9832; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9833; SSE-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9834; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9835; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9836; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9837; SSE-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
9838; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
9839; SSE-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
9840; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9841; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
9842; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9843; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9844; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
9845; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9846; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9847; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
9848; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9849; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
9850; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9851; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9852; SSE-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
9853; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9854; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
9855; SSE-NEXT:    movaps %xmm15, %xmm4
9856; SSE-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
9857; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9858; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9859; SSE-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
9860; SSE-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9861; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9862; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9863; SSE-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
9864; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9865; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3]
9866; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9867; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9868; SSE-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
9869; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9870; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9871; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9872; SSE-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
9873; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9874; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9875; SSE-NEXT:    # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
9876; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9877; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9878; SSE-NEXT:    unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
9879; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9880; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9881; SSE-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
9882; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9883; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9884; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9885; SSE-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
9886; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9887; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9888; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9889; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9890; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9891; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9892; SSE-NEXT:    # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9893; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9894; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9895; SSE-NEXT:    # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
9896; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9897; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9898; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9899; SSE-NEXT:    # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
9900; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9901; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9902; SSE-NEXT:    # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
9903; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9904; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9905; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9906; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9907; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9908; SSE-NEXT:    # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
9909; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9910; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9911; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9912; SSE-NEXT:    # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
9913; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9914; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9915; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9916; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9917; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9918; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9919; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9920; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9921; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9922; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9923; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9924; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9925; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9926; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9927; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9928; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9929; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9930; SSE-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9931; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9932; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9933; SSE-NEXT:    movaps %xmm2, %xmm0
9934; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
9935; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9936; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
9937; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9938; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9939; SSE-NEXT:    movaps %xmm0, %xmm2
9940; SSE-NEXT:    movaps (%rsp), %xmm7 # 16-byte Reload
9941; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
9942; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9943; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
9944; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9945; SSE-NEXT:    movaps %xmm1, %xmm0
9946; SSE-NEXT:    movaps %xmm1, %xmm2
9947; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9948; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
9949; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9950; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9951; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9952; SSE-NEXT:    movaps %xmm5, %xmm2
9953; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9954; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
9955; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9956; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
9957; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9958; SSE-NEXT:    movaps %xmm4, %xmm2
9959; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
9960; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9961; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
9962; SSE-NEXT:    movaps %xmm4, (%rsp) # 16-byte Spill
9963; SSE-NEXT:    movaps %xmm15, %xmm0
9964; SSE-NEXT:    movaps %xmm15, %xmm2
9965; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9966; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
9967; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9968; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9969; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9970; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9971; SSE-NEXT:    movaps %xmm0, %xmm15
9972; SSE-NEXT:    movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
9973; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
9974; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9975; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9976; SSE-NEXT:    movaps %xmm0, %xmm2
9977; SSE-NEXT:    movaps %xmm14, %xmm1
9978; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
9979; SSE-NEXT:    movaps %xmm2, %xmm14
9980; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9981; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9982; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9983; SSE-NEXT:    movaps %xmm0, %xmm2
9984; SSE-NEXT:    movaps %xmm13, %xmm1
9985; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0]
9986; SSE-NEXT:    movaps %xmm2, %xmm13
9987; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9988; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9989; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9990; SSE-NEXT:    movaps %xmm0, %xmm7
9991; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0]
9992; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
9993; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9994; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9995; SSE-NEXT:    movaps %xmm0, %xmm6
9996; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0]
9997; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
9998; SSE-NEXT:    movaps %xmm0, %xmm12
9999; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10000; SSE-NEXT:    movaps %xmm0, %xmm2
10001; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0]
10002; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
10003; SSE-NEXT:    movaps %xmm0, %xmm10
10004; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10005; SSE-NEXT:    movaps %xmm0, %xmm5
10006; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0]
10007; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
10008; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10009; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10010; SSE-NEXT:    movaps %xmm0, %xmm4
10011; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
10012; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
10013; SSE-NEXT:    movaps %xmm0, %xmm8
10014; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10015; SSE-NEXT:    movaps %xmm0, %xmm3
10016; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10017; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
10018; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
10019; SSE-NEXT:    movaps %xmm0, %xmm9
10020; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10021; SSE-NEXT:    movaps %xmm11, %xmm0
10022; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10023; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
10024; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
10025; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10026; SSE-NEXT:    movaps %xmm1, 224(%rsi)
10027; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10028; SSE-NEXT:    movaps %xmm1, 160(%rsi)
10029; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10030; SSE-NEXT:    movaps %xmm1, 96(%rsi)
10031; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10032; SSE-NEXT:    movaps %xmm1, 32(%rsi)
10033; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10034; SSE-NEXT:    movaps %xmm1, 240(%rsi)
10035; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10036; SSE-NEXT:    movaps %xmm1, 176(%rsi)
10037; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10038; SSE-NEXT:    movaps %xmm1, 112(%rsi)
10039; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10040; SSE-NEXT:    movaps %xmm1, 48(%rsi)
10041; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10042; SSE-NEXT:    movaps %xmm1, 192(%rsi)
10043; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10044; SSE-NEXT:    movaps %xmm1, 128(%rsi)
10045; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10046; SSE-NEXT:    movaps %xmm1, 64(%rsi)
10047; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10048; SSE-NEXT:    movaps %xmm1, (%rsi)
10049; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10050; SSE-NEXT:    movaps %xmm1, 208(%rsi)
10051; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10052; SSE-NEXT:    movaps %xmm1, 144(%rsi)
10053; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10054; SSE-NEXT:    movaps %xmm1, 80(%rsi)
10055; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10056; SSE-NEXT:    movaps %xmm1, 16(%rsi)
10057; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10058; SSE-NEXT:    movaps %xmm1, 224(%rdx)
10059; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10060; SSE-NEXT:    movaps %xmm1, 240(%rdx)
10061; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10062; SSE-NEXT:    movaps %xmm1, 192(%rdx)
10063; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10064; SSE-NEXT:    movaps %xmm1, 208(%rdx)
10065; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10066; SSE-NEXT:    movaps %xmm1, 160(%rdx)
10067; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10068; SSE-NEXT:    movaps %xmm1, 176(%rdx)
10069; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10070; SSE-NEXT:    movaps %xmm1, 128(%rdx)
10071; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10072; SSE-NEXT:    movaps %xmm1, 144(%rdx)
10073; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10074; SSE-NEXT:    movaps %xmm1, 96(%rdx)
10075; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10076; SSE-NEXT:    movaps %xmm1, 112(%rdx)
10077; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10078; SSE-NEXT:    movaps %xmm1, 64(%rdx)
10079; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10080; SSE-NEXT:    movaps %xmm1, 80(%rdx)
10081; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10082; SSE-NEXT:    movaps %xmm1, 32(%rdx)
10083; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10084; SSE-NEXT:    movaps %xmm1, 48(%rdx)
10085; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10086; SSE-NEXT:    movaps %xmm1, (%rdx)
10087; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10088; SSE-NEXT:    movaps %xmm1, 16(%rdx)
10089; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10090; SSE-NEXT:    movaps %xmm1, 240(%rcx)
10091; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10092; SSE-NEXT:    movaps %xmm1, 224(%rcx)
10093; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10094; SSE-NEXT:    movaps %xmm1, 208(%rcx)
10095; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10096; SSE-NEXT:    movaps %xmm1, 192(%rcx)
10097; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10098; SSE-NEXT:    movaps %xmm1, 176(%rcx)
10099; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10100; SSE-NEXT:    movaps %xmm1, 160(%rcx)
10101; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10102; SSE-NEXT:    movaps %xmm1, 144(%rcx)
10103; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10104; SSE-NEXT:    movaps %xmm1, 128(%rcx)
10105; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10106; SSE-NEXT:    movaps %xmm1, 112(%rcx)
10107; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10108; SSE-NEXT:    movaps %xmm1, 96(%rcx)
10109; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10110; SSE-NEXT:    movaps %xmm1, 80(%rcx)
10111; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10112; SSE-NEXT:    movaps %xmm1, 64(%rcx)
10113; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10114; SSE-NEXT:    movaps %xmm1, 48(%rcx)
10115; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10116; SSE-NEXT:    movaps %xmm1, 32(%rcx)
10117; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10118; SSE-NEXT:    movaps %xmm1, 16(%rcx)
10119; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10120; SSE-NEXT:    movaps %xmm1, (%rcx)
10121; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10122; SSE-NEXT:    movaps %xmm1, 240(%r8)
10123; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10124; SSE-NEXT:    movaps %xmm1, 224(%r8)
10125; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10126; SSE-NEXT:    movaps %xmm1, 208(%r8)
10127; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10128; SSE-NEXT:    movaps %xmm1, 192(%r8)
10129; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10130; SSE-NEXT:    movaps %xmm1, 176(%r8)
10131; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10132; SSE-NEXT:    movaps %xmm1, 160(%r8)
10133; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10134; SSE-NEXT:    movaps %xmm1, 144(%r8)
10135; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10136; SSE-NEXT:    movaps %xmm1, 128(%r8)
10137; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10138; SSE-NEXT:    movaps %xmm1, 112(%r8)
10139; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10140; SSE-NEXT:    movaps %xmm1, 96(%r8)
10141; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10142; SSE-NEXT:    movaps %xmm1, 80(%r8)
10143; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10144; SSE-NEXT:    movaps %xmm1, 64(%r8)
10145; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10146; SSE-NEXT:    movaps %xmm1, 48(%r8)
10147; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10148; SSE-NEXT:    movaps %xmm1, 32(%r8)
10149; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10150; SSE-NEXT:    movaps %xmm1, 16(%r8)
10151; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10152; SSE-NEXT:    movaps %xmm1, (%r8)
10153; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10154; SSE-NEXT:    movaps %xmm1, 240(%r9)
10155; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10156; SSE-NEXT:    movaps %xmm1, 224(%r9)
10157; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10158; SSE-NEXT:    movaps %xmm1, 208(%r9)
10159; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10160; SSE-NEXT:    movaps %xmm1, 192(%r9)
10161; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10162; SSE-NEXT:    movaps %xmm1, 176(%r9)
10163; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10164; SSE-NEXT:    movaps %xmm1, 160(%r9)
10165; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10166; SSE-NEXT:    movaps %xmm1, 144(%r9)
10167; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10168; SSE-NEXT:    movaps %xmm1, 128(%r9)
10169; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10170; SSE-NEXT:    movaps %xmm1, 112(%r9)
10171; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10172; SSE-NEXT:    movaps %xmm1, 96(%r9)
10173; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10174; SSE-NEXT:    movaps %xmm1, 80(%r9)
10175; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10176; SSE-NEXT:    movaps %xmm1, 64(%r9)
10177; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10178; SSE-NEXT:    movaps %xmm1, 48(%r9)
10179; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10180; SSE-NEXT:    movaps %xmm1, 32(%r9)
10181; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10182; SSE-NEXT:    movaps %xmm1, 16(%r9)
10183; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10184; SSE-NEXT:    movaps %xmm1, (%r9)
10185; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10186; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10187; SSE-NEXT:    movaps %xmm1, 240(%rax)
10188; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10189; SSE-NEXT:    movaps %xmm1, 224(%rax)
10190; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10191; SSE-NEXT:    movaps %xmm1, 208(%rax)
10192; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10193; SSE-NEXT:    movaps %xmm1, 192(%rax)
10194; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10195; SSE-NEXT:    movaps %xmm1, 176(%rax)
10196; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10197; SSE-NEXT:    movaps %xmm1, 160(%rax)
10198; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10199; SSE-NEXT:    movaps %xmm1, 144(%rax)
10200; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10201; SSE-NEXT:    movaps %xmm1, 128(%rax)
10202; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10203; SSE-NEXT:    movaps %xmm1, 112(%rax)
10204; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10205; SSE-NEXT:    movaps %xmm1, 96(%rax)
10206; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10207; SSE-NEXT:    movaps %xmm1, 80(%rax)
10208; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10209; SSE-NEXT:    movaps %xmm1, 64(%rax)
10210; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10211; SSE-NEXT:    movaps %xmm1, 48(%rax)
10212; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10213; SSE-NEXT:    movaps %xmm1, 32(%rax)
10214; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10215; SSE-NEXT:    movaps %xmm1, 16(%rax)
10216; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10217; SSE-NEXT:    movaps %xmm1, (%rax)
10218; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10219; SSE-NEXT:    movaps %xmm3, 240(%rax)
10220; SSE-NEXT:    movaps %xmm4, 224(%rax)
10221; SSE-NEXT:    movaps %xmm5, 208(%rax)
10222; SSE-NEXT:    movaps %xmm2, 192(%rax)
10223; SSE-NEXT:    movaps %xmm6, 176(%rax)
10224; SSE-NEXT:    movaps %xmm7, 160(%rax)
10225; SSE-NEXT:    movaps %xmm13, 144(%rax)
10226; SSE-NEXT:    movaps %xmm14, 128(%rax)
10227; SSE-NEXT:    movaps %xmm15, 112(%rax)
10228; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10229; SSE-NEXT:    movaps %xmm1, 96(%rax)
10230; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10231; SSE-NEXT:    movaps %xmm1, 80(%rax)
10232; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10233; SSE-NEXT:    movaps %xmm1, 64(%rax)
10234; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10235; SSE-NEXT:    movaps %xmm1, 48(%rax)
10236; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10237; SSE-NEXT:    movaps %xmm1, 32(%rax)
10238; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10239; SSE-NEXT:    movaps %xmm1, 16(%rax)
10240; SSE-NEXT:    movaps %xmm0, (%rax)
10241; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10242; SSE-NEXT:    movaps %xmm9, 240(%rax)
10243; SSE-NEXT:    movaps %xmm8, 224(%rax)
10244; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10245; SSE-NEXT:    movaps %xmm0, 208(%rax)
10246; SSE-NEXT:    movaps %xmm10, 192(%rax)
10247; SSE-NEXT:    movaps %xmm12, 176(%rax)
10248; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10249; SSE-NEXT:    movaps %xmm0, 160(%rax)
10250; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10251; SSE-NEXT:    movaps %xmm0, 144(%rax)
10252; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10253; SSE-NEXT:    movaps %xmm0, 128(%rax)
10254; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10255; SSE-NEXT:    movaps %xmm0, 112(%rax)
10256; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10257; SSE-NEXT:    movaps %xmm0, 96(%rax)
10258; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
10259; SSE-NEXT:    movaps %xmm0, 80(%rax)
10260; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10261; SSE-NEXT:    movaps %xmm0, 64(%rax)
10262; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10263; SSE-NEXT:    movaps %xmm0, 48(%rax)
10264; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10265; SSE-NEXT:    movaps %xmm0, 32(%rax)
10266; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10267; SSE-NEXT:    movaps %xmm0, 16(%rax)
10268; SSE-NEXT:    movaps %xmm11, (%rax)
10269; SSE-NEXT:    addq $2232, %rsp # imm = 0x8B8
10270; SSE-NEXT:    retq
10271;
10272; AVX-LABEL: load_i32_stride8_vf64:
10273; AVX:       # %bb.0:
10274; AVX-NEXT:    subq $3720, %rsp # imm = 0xE88
10275; AVX-NEXT:    vmovaps 288(%rdi), %xmm13
10276; AVX-NEXT:    vmovaps 256(%rdi), %xmm15
10277; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
10278; AVX-NEXT:    vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10279; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
10280; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10281; AVX-NEXT:    vmovaps 320(%rdi), %xmm2
10282; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10283; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10284; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10285; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
10286; AVX-NEXT:    vmovaps 416(%rdi), %xmm1
10287; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10288; AVX-NEXT:    vmovaps 384(%rdi), %xmm2
10289; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10290; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10291; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10292; AVX-NEXT:    vmovaps 480(%rdi), %xmm2
10293; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10294; AVX-NEXT:    vmovaps 448(%rdi), %xmm3
10295; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10296; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10297; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10298; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
10299; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10300; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
10301; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10302; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10303; AVX-NEXT:    vmovaps 928(%rdi), %xmm1
10304; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10305; AVX-NEXT:    vmovaps 896(%rdi), %xmm0
10306; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10307; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
10308; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10309; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
10310; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10311; AVX-NEXT:    vmovaps 960(%rdi), %xmm2
10312; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10313; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10314; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10315; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10316; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10317; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10318; AVX-NEXT:    vmovaps 800(%rdi), %xmm1
10319; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10320; AVX-NEXT:    vmovaps 768(%rdi), %xmm14
10321; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
10322; AVX-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10323; AVX-NEXT:    vmovaps 864(%rdi), %xmm2
10324; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10325; AVX-NEXT:    vmovaps 832(%rdi), %xmm3
10326; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10327; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10328; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10329; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10330; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10331; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10332; AVX-NEXT:    vmovaps 1440(%rdi), %xmm1
10333; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10334; AVX-NEXT:    vmovaps 1408(%rdi), %xmm0
10335; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10336; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
10337; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10338; AVX-NEXT:    vmovaps 1504(%rdi), %xmm1
10339; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10340; AVX-NEXT:    vmovaps 1472(%rdi), %xmm2
10341; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10342; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10343; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10344; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10345; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10346; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10347; AVX-NEXT:    vmovaps 1312(%rdi), %xmm2
10348; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10349; AVX-NEXT:    vmovaps 1280(%rdi), %xmm1
10350; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10351; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
10352; AVX-NEXT:    vmovaps 1376(%rdi), %xmm2
10353; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10354; AVX-NEXT:    vmovaps 1344(%rdi), %xmm3
10355; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10356; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10357; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10358; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10359; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10360; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10361; AVX-NEXT:    vmovaps 1952(%rdi), %xmm0
10362; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10363; AVX-NEXT:    vmovaps 1920(%rdi), %xmm1
10364; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10365; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10366; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10367; AVX-NEXT:    vmovaps 2016(%rdi), %xmm1
10368; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10369; AVX-NEXT:    vmovaps 1984(%rdi), %xmm2
10370; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10371; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10372; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10373; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10374; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10375; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10376; AVX-NEXT:    vmovaps 1824(%rdi), %xmm1
10377; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10378; AVX-NEXT:    vmovaps 1792(%rdi), %xmm5
10379; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
10380; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10381; AVX-NEXT:    vmovaps 1888(%rdi), %xmm2
10382; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10383; AVX-NEXT:    vmovaps 1856(%rdi), %xmm3
10384; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10385; AVX-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10386; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
10387; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10388; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10389; AVX-NEXT:    vmovaps 160(%rdi), %xmm0
10390; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10391; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
10392; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10393; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10394; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10395; AVX-NEXT:    vmovaps 224(%rdi), %xmm1
10396; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10397; AVX-NEXT:    vmovaps 192(%rdi), %xmm2
10398; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10399; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10400; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10401; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10402; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10403; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10404; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
10405; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
10406; AVX-NEXT:    vmovaps (%rdi), %xmm4
10407; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
10408; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10409; AVX-NEXT:    vmovaps 96(%rdi), %xmm2
10410; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10411; AVX-NEXT:    vmovaps 64(%rdi), %xmm3
10412; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10413; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10414; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
10415; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10416; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10417; AVX-NEXT:    vmovaps 672(%rdi), %xmm1
10418; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10419; AVX-NEXT:    vmovaps 640(%rdi), %xmm0
10420; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10421; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
10422; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10423; AVX-NEXT:    vmovaps 736(%rdi), %xmm1
10424; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10425; AVX-NEXT:    vmovaps 704(%rdi), %xmm2
10426; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10427; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10428; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10429; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1]
10430; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
10431; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
10432; AVX-NEXT:    vmovaps 544(%rdi), %xmm0
10433; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10434; AVX-NEXT:    vmovaps 512(%rdi), %xmm2
10435; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
10436; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10437; AVX-NEXT:    vmovaps 608(%rdi), %xmm0
10438; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10439; AVX-NEXT:    vmovaps 576(%rdi), %xmm1
10440; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10441; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10442; AVX-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
10443; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7]
10444; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10445; AVX-NEXT:    vmovaps 1184(%rdi), %xmm0
10446; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10447; AVX-NEXT:    vmovaps 1152(%rdi), %xmm6
10448; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10449; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10450; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm7
10451; AVX-NEXT:    vmovaps 1248(%rdi), %xmm0
10452; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10453; AVX-NEXT:    vmovaps 1216(%rdi), %xmm6
10454; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10455; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10456; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10457; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1]
10458; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10459; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7]
10460; AVX-NEXT:    vmovaps 1056(%rdi), %xmm0
10461; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10462; AVX-NEXT:    vmovaps 1024(%rdi), %xmm6
10463; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10464; AVX-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10465; AVX-NEXT:    vmovaps 1120(%rdi), %xmm0
10466; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10467; AVX-NEXT:    vmovaps 1088(%rdi), %xmm6
10468; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10469; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10470; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10471; AVX-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
10472; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7]
10473; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10474; AVX-NEXT:    vmovaps 1696(%rdi), %xmm0
10475; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10476; AVX-NEXT:    vmovaps 1664(%rdi), %xmm6
10477; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10478; AVX-NEXT:    vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10479; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm9
10480; AVX-NEXT:    vmovaps 1760(%rdi), %xmm0
10481; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10482; AVX-NEXT:    vmovaps 1728(%rdi), %xmm6
10483; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10484; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10485; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10486; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1]
10487; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
10488; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm11[6,7]
10489; AVX-NEXT:    vmovaps 1568(%rdi), %xmm12
10490; AVX-NEXT:    vmovaps 1536(%rdi), %xmm8
10491; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
10492; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10493; AVX-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10494; AVX-NEXT:    vmovaps 1632(%rdi), %xmm6
10495; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10496; AVX-NEXT:    vmovaps 1600(%rdi), %xmm7
10497; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10498; AVX-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
10499; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
10500; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10501; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10502; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1]
10503; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
10504; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10505; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
10506; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload
10507; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10508; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1]
10509; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10510; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3]
10511; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
10512; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
10513; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10514; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10515; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1]
10516; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10517; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10518; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10519; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
10520; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload
10521; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10522; AVX-NEXT:    # xmm15 = mem[1,1,1,1]
10523; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10524; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3]
10525; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
10526; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
10527; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10528; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10529; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10530; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
10531; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10532; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10533; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10534; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
10535; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload
10536; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10537; AVX-NEXT:    # xmm15 = mem[1,1,1,1]
10538; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10539; AVX-NEXT:    # xmm15 = xmm15[0],mem[1],xmm15[2,3]
10540; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
10541; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
10542; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10543; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10544; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
10545; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10546; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10547; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
10548; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
10549; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10550; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1]
10551; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10552; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3]
10553; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
10554; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
10555; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
10556; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10557; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1]
10558; AVX-NEXT:    vblendps $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
10559; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10560; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
10561; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload
10562; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10563; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1]
10564; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10565; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
10566; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
10567; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
10568; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
10569; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10570; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1]
10571; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10572; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10573; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
10574; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
10575; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10576; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
10577; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10578; AVX-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
10579; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10580; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10581; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10582; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10583; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10584; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
10585; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10586; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10587; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10588; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
10589; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
10590; AVX-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10591; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
10592; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10593; AVX-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
10594; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10595; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10596; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10597; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10598; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
10599; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
10600; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
10601; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
10602; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10603; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1]
10604; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10605; AVX-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
10606; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10607; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10608; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10609; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10610; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10611; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
10612; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10613; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10614; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10615; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10616; AVX-NEXT:    # xmm0 = mem[2,2,2,2]
10617; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10618; AVX-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
10619; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10620; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10621; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10622; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10623; AVX-NEXT:    # xmm1 = mem[2,2,2,2]
10624; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10625; AVX-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
10626; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
10627; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10628; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10629; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10630; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10631; AVX-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
10632; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10633; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10634; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
10635; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10636; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10637; AVX-NEXT:    # xmm0 = mem[2,2,2,2]
10638; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10639; AVX-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
10640; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10641; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10642; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10643; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10644; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
10645; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10646; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
10647; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
10648; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10649; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10650; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10651; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10652; AVX-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
10653; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10654; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10655; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10656; AVX-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
10657; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10658; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10659; AVX-NEXT:    # xmm0 = mem[2,2,2,2]
10660; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10661; AVX-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
10662; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10663; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10664; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10665; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10666; AVX-NEXT:    # xmm1 = mem[2,2,2,2]
10667; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10668; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
10669; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
10670; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10671; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10672; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10673; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10674; AVX-NEXT:    # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
10675; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10676; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
10677; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10678; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10679; AVX-NEXT:    # xmm0 = mem[2,2,2,2]
10680; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10681; AVX-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
10682; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10683; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10684; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10685; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10686; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2]
10687; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10688; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
10689; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
10690; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10691; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10692; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10693; AVX-NEXT:    vunpckhps (%rsp), %xmm0, %xmm8 # 16-byte Folded Reload
10694; AVX-NEXT:    # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
10695; AVX-NEXT:    vmovaps %xmm8, (%rsp) # 16-byte Spill
10696; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
10697; AVX-NEXT:    # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
10698; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10699; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10700; AVX-NEXT:    # xmm0 = mem[2,2,2,2]
10701; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10702; AVX-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
10703; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
10704; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10705; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10706; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10707; AVX-NEXT:    # xmm1 = mem[2,2,2,2]
10708; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10709; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
10710; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
10711; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10712; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10713; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10714; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
10715; AVX-NEXT:    # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
10716; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10717; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10718; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10719; AVX-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10720; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10721; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10722; AVX-NEXT:    # xmm15 = mem[2,2,2,2]
10723; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10724; AVX-NEXT:    # xmm15 = mem[0,1,2],xmm15[3]
10725; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
10726; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm14
10727; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
10728; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10729; AVX-NEXT:    # xmm15 = mem[2,2,2,2]
10730; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10731; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
10732; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3]
10733; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7]
10734; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10735; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10736; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
10737; AVX-NEXT:    # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3]
10738; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10739; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10740; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10741; AVX-NEXT:    # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
10742; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10743; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
10744; AVX-NEXT:    # xmm13 = mem[2,2,2,2]
10745; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
10746; AVX-NEXT:    # xmm13 = mem[0,1,2],xmm13[3]
10747; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
10748; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm12
10749; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
10750; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10751; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2]
10752; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10753; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
10754; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
10755; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7]
10756; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10757; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10758; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
10759; AVX-NEXT:    # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3]
10760; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10761; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload
10762; AVX-NEXT:    # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3]
10763; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10764; AVX-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
10765; AVX-NEXT:    # xmm11 = mem[2,2,2,2]
10766; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10767; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
10768; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
10769; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm9
10770; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
10771; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10772; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2]
10773; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10774; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
10775; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3]
10776; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7]
10777; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10778; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10779; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
10780; AVX-NEXT:    # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3]
10781; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10782; AVX-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1]
10783; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10784; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload
10785; AVX-NEXT:    # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3]
10786; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
10787; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10788; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
10789; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10790; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
10791; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
10792; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10793; AVX-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
10794; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10795; AVX-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1]
10796; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10797; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload
10798; AVX-NEXT:    # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3]
10799; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10800; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10801; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
10802; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
10803; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
10804; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
10805; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10806; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
10807; AVX-NEXT:    # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3]
10808; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10809; AVX-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1]
10810; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10811; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload
10812; AVX-NEXT:    # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3]
10813; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10814; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10815; AVX-NEXT:    # xmm6 = mem[2,3,2,3]
10816; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
10817; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
10818; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
10819; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10820; AVX-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
10821; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10822; AVX-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1]
10823; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10824; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
10825; AVX-NEXT:    # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
10826; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
10827; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10828; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
10829; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
10830; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
10831; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10832; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10833; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
10834; AVX-NEXT:    # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
10835; AVX-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
10836; AVX-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
10837; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10838; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
10839; AVX-NEXT:    # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
10840; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
10841; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10842; AVX-NEXT:    # xmm3 = mem[2,3,2,3]
10843; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
10844; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
10845; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
10846; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10847; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
10848; AVX-NEXT:    # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
10849; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10850; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
10851; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10852; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
10853; AVX-NEXT:    # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
10854; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10855; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10856; AVX-NEXT:    # xmm1 = mem[2,3,2,3]
10857; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10858; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
10859; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10860; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10861; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
10862; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10863; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
10864; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10865; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10866; AVX-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10867; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10868; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10869; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
10870; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10871; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10872; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10873; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10874; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3]
10875; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10876; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
10877; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
10878; AVX-NEXT:    # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3]
10879; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10880; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10881; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
10882; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10883; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10884; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10885; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10886; AVX-NEXT:    vmovaps 416(%rdi), %ymm2
10887; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10888; AVX-NEXT:    vmovaps 384(%rdi), %ymm3
10889; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10890; AVX-NEXT:    vmovaps 448(%rdi), %ymm4
10891; AVX-NEXT:    vmovaps 480(%rdi), %ymm0
10892; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10893; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
10894; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
10895; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10896; AVX-NEXT:    vmovaps 320(%rdi), %ymm6
10897; AVX-NEXT:    vmovaps 352(%rdi), %ymm13
10898; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm6[0],ymm13[2],ymm6[2]
10899; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10900; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10901; AVX-NEXT:    vmovaps 288(%rdi), %ymm8
10902; AVX-NEXT:    vmovaps 256(%rdi), %ymm7
10903; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5]
10904; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
10905; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
10906; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10907; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10908; AVX-NEXT:    vmovaps 672(%rdi), %ymm2
10909; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10910; AVX-NEXT:    vmovaps 640(%rdi), %ymm9
10911; AVX-NEXT:    vmovaps 704(%rdi), %ymm12
10912; AVX-NEXT:    vmovaps 736(%rdi), %ymm10
10913; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
10914; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5]
10915; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10916; AVX-NEXT:    vmovaps 576(%rdi), %ymm14
10917; AVX-NEXT:    vmovaps 608(%rdi), %ymm11
10918; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
10919; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10920; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10921; AVX-NEXT:    vmovaps 544(%rdi), %ymm2
10922; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10923; AVX-NEXT:    vmovaps 512(%rdi), %ymm3
10924; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10925; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
10926; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
10927; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
10928; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10929; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10930; AVX-NEXT:    vmovaps 928(%rdi), %ymm2
10931; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10932; AVX-NEXT:    vmovaps 896(%rdi), %ymm3
10933; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10934; AVX-NEXT:    vmovaps 960(%rdi), %ymm1
10935; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10936; AVX-NEXT:    vmovaps 992(%rdi), %ymm0
10937; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10938; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10939; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
10940; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10941; AVX-NEXT:    vmovaps 832(%rdi), %ymm2
10942; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10943; AVX-NEXT:    vmovaps 864(%rdi), %ymm1
10944; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10945; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
10946; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10947; AVX-NEXT:    vmovaps 800(%rdi), %ymm3
10948; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10949; AVX-NEXT:    vmovaps 768(%rdi), %ymm2
10950; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10951; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
10952; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
10953; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
10954; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10955; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10956; AVX-NEXT:    vmovaps 1184(%rdi), %ymm2
10957; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10958; AVX-NEXT:    vmovaps 1152(%rdi), %ymm1
10959; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10960; AVX-NEXT:    vmovaps 1216(%rdi), %ymm0
10961; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10962; AVX-NEXT:    vmovaps 1248(%rdi), %ymm3
10963; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10964; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
10965; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
10966; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10967; AVX-NEXT:    vmovaps 1088(%rdi), %ymm2
10968; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10969; AVX-NEXT:    vmovaps 1120(%rdi), %ymm1
10970; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10971; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
10972; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10973; AVX-NEXT:    vmovaps 1056(%rdi), %ymm3
10974; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10975; AVX-NEXT:    vmovaps 1024(%rdi), %ymm2
10976; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10977; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
10978; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
10979; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
10980; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10981; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10982; AVX-NEXT:    vmovaps 1440(%rdi), %ymm2
10983; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10984; AVX-NEXT:    vmovaps 1408(%rdi), %ymm3
10985; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10986; AVX-NEXT:    vmovaps 1472(%rdi), %ymm1
10987; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10988; AVX-NEXT:    vmovaps 1504(%rdi), %ymm0
10989; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10990; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10991; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
10992; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10993; AVX-NEXT:    vmovaps 1344(%rdi), %ymm2
10994; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10995; AVX-NEXT:    vmovaps 1376(%rdi), %ymm1
10996; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10997; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
10998; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10999; AVX-NEXT:    vmovaps 1312(%rdi), %ymm3
11000; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11001; AVX-NEXT:    vmovaps 1280(%rdi), %ymm2
11002; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11003; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
11004; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11005; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
11006; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11007; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11008; AVX-NEXT:    vmovaps 1696(%rdi), %ymm2
11009; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11010; AVX-NEXT:    vmovaps 1664(%rdi), %ymm3
11011; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11012; AVX-NEXT:    vmovaps 1728(%rdi), %ymm1
11013; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11014; AVX-NEXT:    vmovaps 1760(%rdi), %ymm0
11015; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11016; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11017; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11018; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11019; AVX-NEXT:    vmovaps 1600(%rdi), %ymm2
11020; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11021; AVX-NEXT:    vmovaps 1632(%rdi), %ymm1
11022; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11023; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
11024; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
11025; AVX-NEXT:    vmovaps 1568(%rdi), %ymm3
11026; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11027; AVX-NEXT:    vmovaps 1536(%rdi), %ymm2
11028; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11029; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
11030; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11031; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
11032; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11033; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11034; AVX-NEXT:    vmovaps 1952(%rdi), %ymm2
11035; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11036; AVX-NEXT:    vmovaps 1920(%rdi), %ymm3
11037; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11038; AVX-NEXT:    vmovaps 1984(%rdi), %ymm1
11039; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11040; AVX-NEXT:    vmovaps 2016(%rdi), %ymm0
11041; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11042; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11043; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11044; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11045; AVX-NEXT:    vmovaps 1856(%rdi), %ymm2
11046; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11047; AVX-NEXT:    vmovaps 1888(%rdi), %ymm1
11048; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11049; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
11050; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
11051; AVX-NEXT:    vmovaps 1824(%rdi), %ymm3
11052; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11053; AVX-NEXT:    vmovaps 1792(%rdi), %ymm2
11054; AVX-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
11055; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
11056; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11057; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
11058; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11059; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11060; AVX-NEXT:    vmovaps 160(%rdi), %ymm2
11061; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11062; AVX-NEXT:    vmovaps 128(%rdi), %ymm3
11063; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11064; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
11065; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11066; AVX-NEXT:    vmovaps 224(%rdi), %ymm0
11067; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11068; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11069; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11070; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11071; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
11072; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11073; AVX-NEXT:    vmovaps 96(%rdi), %ymm1
11074; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11075; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11076; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
11077; AVX-NEXT:    vmovaps (%rdi), %ymm1
11078; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11079; AVX-NEXT:    vmovaps 32(%rdi), %ymm3
11080; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11081; AVX-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
11082; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11083; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
11084; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
11085; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11086; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11087; AVX-NEXT:    vmovaps %ymm4, %ymm5
11088; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11089; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
11090; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11091; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11092; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm4[1,0],ymm1[5,4],ymm4[5,4]
11093; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11094; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11095; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[4],ymm13[4],ymm6[5],ymm13[5]
11096; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11097; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11098; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11099; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4]
11100; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11101; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11102; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11103; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11104; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11105; AVX-NEXT:    vmovaps %ymm12, %ymm13
11106; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11107; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5]
11108; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11109; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11110; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[1,0],ymm9[1,0],ymm12[5,4],ymm9[5,4]
11111; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11112; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11113; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[4],ymm11[4],ymm14[5],ymm11[5]
11114; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11115; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11116; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11117; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,0],ymm14[1,0],ymm15[5,4],ymm14[5,4]
11118; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11119; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11120; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11121; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11122; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11123; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11124; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11125; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11126; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11127; AVX-NEXT:    # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4]
11128; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11129; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11130; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11131; AVX-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
11132; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11133; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11134; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11135; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
11136; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11137; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11138; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11139; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11140; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11141; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11142; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11143; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11144; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11145; AVX-NEXT:    # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4]
11146; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11147; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11148; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11149; AVX-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
11150; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11151; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11152; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11153; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
11154; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11155; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11156; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11157; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11158; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11159; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11160; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11161; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11162; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11163; AVX-NEXT:    # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4]
11164; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11165; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11166; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11167; AVX-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
11168; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11169; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11170; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11171; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
11172; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11173; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11174; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11175; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11176; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11177; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11178; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11179; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11180; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11181; AVX-NEXT:    # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4]
11182; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11183; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11184; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11185; AVX-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
11186; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11187; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11188; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11189; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
11190; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11191; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11192; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11193; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11194; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11195; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11196; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11197; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11198; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11199; AVX-NEXT:    # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4]
11200; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11201; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11202; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11203; AVX-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
11204; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11205; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11206; AVX-NEXT:    vshufps $17, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
11207; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
11208; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11209; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11210; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11211; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11212; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11213; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11214; AVX-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11215; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11216; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11217; AVX-NEXT:    # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4]
11218; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11219; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11220; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11221; AVX-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
11222; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11223; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11224; AVX-NEXT:    vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11225; AVX-NEXT:    # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
11226; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11227; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11228; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11229; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11230; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
11231; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
11232; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11233; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11234; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
11235; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11236; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
11237; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11238; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11239; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11240; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11241; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm13[1],ymm10[3],ymm13[3]
11242; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[6],ymm12[6],ymm9[7],ymm12[7]
11243; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11244; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
11245; AVX-NEXT:    # ymm2 = ymm11[1],mem[1],ymm11[3],mem[3]
11246; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11247; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11248; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7]
11249; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11250; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11251; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11252; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11253; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11254; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11255; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
11256; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11257; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11258; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7]
11259; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11260; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11261; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11262; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
11263; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11264; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11265; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11266; AVX-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7]
11267; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11268; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11269; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11270; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11271; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11272; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
11273; AVX-NEXT:    # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3]
11274; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11275; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11276; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7]
11277; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11278; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11279; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11280; AVX-NEXT:    # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
11281; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11282; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11283; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11284; AVX-NEXT:    # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
11285; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11286; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11287; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11288; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11289; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11290; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11291; AVX-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11292; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11293; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11294; AVX-NEXT:    # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
11295; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11296; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11297; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11298; AVX-NEXT:    # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
11299; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11300; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11301; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11302; AVX-NEXT:    # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
11303; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11304; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11305; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11306; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11307; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11308; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11309; AVX-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11310; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11311; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11312; AVX-NEXT:    # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
11313; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11314; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11315; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11316; AVX-NEXT:    # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
11317; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11318; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11319; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11320; AVX-NEXT:    # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
11321; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11322; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11323; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11324; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11325; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11326; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11327; AVX-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11328; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11329; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11330; AVX-NEXT:    # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
11331; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11332; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11333; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11334; AVX-NEXT:    # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
11335; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11336; AVX-NEXT:    vmovups (%rsp), %ymm15 # 32-byte Reload
11337; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11338; AVX-NEXT:    # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
11339; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11340; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11341; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11342; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11343; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11344; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11345; AVX-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11346; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11347; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11348; AVX-NEXT:    # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
11349; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
11350; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11351; AVX-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11352; AVX-NEXT:    # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
11353; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11354; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11355; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
11356; AVX-NEXT:    # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
11357; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11358; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0]
11359; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11360; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11361; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11362; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11363; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11364; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11365; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11366; AVX-NEXT:    # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
11367; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11368; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11369; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
11370; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11371; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
11372; AVX-NEXT:    # ymm15 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4]
11373; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11374; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11375; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11376; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11377; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11378; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11379; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11380; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11381; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11382; AVX-NEXT:    # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
11383; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11384; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11385; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11386; AVX-NEXT:    # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
11387; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
11388; AVX-NEXT:    # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11389; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11390; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11391; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11392; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11393; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11394; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7]
11395; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm9[3,0],ymm3[7,4],ymm9[7,4]
11396; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11397; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7]
11398; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm7[3,0],ymm10[3,0],ymm7[7,4],ymm10[7,4]
11399; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11400; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11401; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11402; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11403; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11404; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7]
11405; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[3,0],ymm14[3,0],ymm12[7,4],ymm14[7,4]
11406; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11407; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11408; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11409; AVX-NEXT:    # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11410; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11411; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
11412; AVX-NEXT:    # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11413; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11414; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11415; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11416; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11417; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11418; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11419; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11420; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11421; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11422; AVX-NEXT:    # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11423; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11424; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11425; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11426; AVX-NEXT:    # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11427; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11428; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
11429; AVX-NEXT:    # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11430; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11431; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11432; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11433; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11434; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11435; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11436; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11437; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11438; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11439; AVX-NEXT:    # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11440; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11441; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11442; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11443; AVX-NEXT:    # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11444; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11445; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
11446; AVX-NEXT:    # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11447; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11448; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
11449; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3]
11450; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11451; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11452; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11453; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11454; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11455; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11456; AVX-NEXT:    # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11457; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11458; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11459; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11460; AVX-NEXT:    # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11461; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11462; AVX-NEXT:    vshufps $51, (%rsp), %ymm1, %ymm7 # 32-byte Folded Reload
11463; AVX-NEXT:    # ymm7 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11464; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11465; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
11466; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm7[2,0],xmm2[2,3]
11467; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11468; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11469; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11470; AVX-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11471; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11472; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11473; AVX-NEXT:    # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11474; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
11475; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11476; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
11477; AVX-NEXT:    # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11478; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11479; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11480; AVX-NEXT:    # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11481; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
11482; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
11483; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
11484; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11485; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11486; AVX-NEXT:    vmovaps %ymm1, 192(%rsi)
11487; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11488; AVX-NEXT:    vmovaps %ymm1, 128(%rsi)
11489; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11490; AVX-NEXT:    vmovaps %ymm1, 64(%rsi)
11491; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11492; AVX-NEXT:    vmovaps %ymm1, (%rsi)
11493; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11494; AVX-NEXT:    vmovaps %ymm1, 224(%rsi)
11495; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11496; AVX-NEXT:    vmovaps %ymm1, 160(%rsi)
11497; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11498; AVX-NEXT:    vmovaps %ymm1, 96(%rsi)
11499; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11500; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
11501; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11502; AVX-NEXT:    vmovaps %ymm1, 192(%rdx)
11503; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11504; AVX-NEXT:    vmovaps %ymm1, 128(%rdx)
11505; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11506; AVX-NEXT:    vmovaps %ymm1, 64(%rdx)
11507; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11508; AVX-NEXT:    vmovaps %ymm1, (%rdx)
11509; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11510; AVX-NEXT:    vmovaps %ymm1, 224(%rdx)
11511; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11512; AVX-NEXT:    vmovaps %ymm1, 160(%rdx)
11513; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11514; AVX-NEXT:    vmovaps %ymm1, 96(%rdx)
11515; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11516; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
11517; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11518; AVX-NEXT:    vmovaps %ymm1, 192(%rcx)
11519; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11520; AVX-NEXT:    vmovaps %ymm1, 128(%rcx)
11521; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11522; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
11523; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11524; AVX-NEXT:    vmovaps %ymm1, (%rcx)
11525; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11526; AVX-NEXT:    vmovaps %ymm1, 224(%rcx)
11527; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11528; AVX-NEXT:    vmovaps %ymm1, 160(%rcx)
11529; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11530; AVX-NEXT:    vmovaps %ymm1, 96(%rcx)
11531; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11532; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
11533; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11534; AVX-NEXT:    vmovaps %ymm1, 192(%r8)
11535; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11536; AVX-NEXT:    vmovaps %ymm1, 128(%r8)
11537; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11538; AVX-NEXT:    vmovaps %ymm1, 64(%r8)
11539; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11540; AVX-NEXT:    vmovaps %ymm1, (%r8)
11541; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11542; AVX-NEXT:    vmovaps %ymm1, 224(%r8)
11543; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11544; AVX-NEXT:    vmovaps %ymm1, 160(%r8)
11545; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11546; AVX-NEXT:    vmovaps %ymm1, 96(%r8)
11547; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11548; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
11549; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11550; AVX-NEXT:    vmovaps %ymm1, 224(%r9)
11551; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11552; AVX-NEXT:    vmovaps %ymm1, 192(%r9)
11553; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11554; AVX-NEXT:    vmovaps %ymm1, 160(%r9)
11555; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11556; AVX-NEXT:    vmovaps %ymm1, 128(%r9)
11557; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11558; AVX-NEXT:    vmovaps %ymm1, 96(%r9)
11559; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11560; AVX-NEXT:    vmovaps %ymm1, 64(%r9)
11561; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11562; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
11563; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11564; AVX-NEXT:    vmovaps %ymm1, (%r9)
11565; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11566; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11567; AVX-NEXT:    vmovaps %ymm1, 224(%rax)
11568; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11569; AVX-NEXT:    vmovaps %ymm1, 192(%rax)
11570; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11571; AVX-NEXT:    vmovaps %ymm1, 160(%rax)
11572; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11573; AVX-NEXT:    vmovaps %ymm1, 128(%rax)
11574; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11575; AVX-NEXT:    vmovaps %ymm1, 96(%rax)
11576; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11577; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
11578; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11579; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
11580; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11581; AVX-NEXT:    vmovaps %ymm1, (%rax)
11582; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11583; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11584; AVX-NEXT:    vmovaps %ymm1, 224(%rax)
11585; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11586; AVX-NEXT:    vmovaps %ymm1, 192(%rax)
11587; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11588; AVX-NEXT:    vmovaps %ymm1, 160(%rax)
11589; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11590; AVX-NEXT:    vmovaps %ymm1, 128(%rax)
11591; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11592; AVX-NEXT:    vmovaps %ymm1, 96(%rax)
11593; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11594; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
11595; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11596; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
11597; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11598; AVX-NEXT:    vmovaps %ymm1, (%rax)
11599; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11600; AVX-NEXT:    vmovaps %ymm7, 224(%rax)
11601; AVX-NEXT:    vmovaps %ymm15, 192(%rax)
11602; AVX-NEXT:    vmovaps %ymm6, 160(%rax)
11603; AVX-NEXT:    vmovaps %ymm5, 128(%rax)
11604; AVX-NEXT:    vmovaps %ymm3, 96(%rax)
11605; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11606; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
11607; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
11608; AVX-NEXT:    vmovaps %ymm0, (%rax)
11609; AVX-NEXT:    addq $3720, %rsp # imm = 0xE88
11610; AVX-NEXT:    vzeroupper
11611; AVX-NEXT:    retq
11612;
11613; AVX2-LABEL: load_i32_stride8_vf64:
11614; AVX2:       # %bb.0:
11615; AVX2-NEXT:    subq $3528, %rsp # imm = 0xDC8
11616; AVX2-NEXT:    vmovaps 288(%rdi), %xmm10
11617; AVX2-NEXT:    vmovaps 256(%rdi), %xmm0
11618; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11619; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
11620; AVX2-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11621; AVX2-NEXT:    vmovaps 352(%rdi), %xmm9
11622; AVX2-NEXT:    vbroadcastss %xmm9, %xmm1
11623; AVX2-NEXT:    vmovaps 320(%rdi), %xmm2
11624; AVX2-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
11625; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11626; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11627; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11628; AVX2-NEXT:    vmovaps 416(%rdi), %xmm1
11629; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11630; AVX2-NEXT:    vmovaps 384(%rdi), %xmm2
11631; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11632; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11633; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11634; AVX2-NEXT:    vmovaps 480(%rdi), %xmm13
11635; AVX2-NEXT:    vbroadcastss %xmm13, %xmm2
11636; AVX2-NEXT:    vmovaps 448(%rdi), %xmm3
11637; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11638; AVX2-NEXT:    vbroadcastss %xmm3, %xmm3
11639; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11640; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11641; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
11642; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11643; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11644; AVX2-NEXT:    vmovaps 800(%rdi), %xmm0
11645; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11646; AVX2-NEXT:    vmovaps 768(%rdi), %xmm1
11647; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11648; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11649; AVX2-NEXT:    vmovaps 864(%rdi), %xmm12
11650; AVX2-NEXT:    vbroadcastss %xmm12, %xmm1
11651; AVX2-NEXT:    vmovaps 832(%rdi), %xmm2
11652; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11653; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11654; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11655; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11656; AVX2-NEXT:    vmovaps 992(%rdi), %xmm1
11657; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11658; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11659; AVX2-NEXT:    vmovaps 960(%rdi), %xmm2
11660; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11661; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11662; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11663; AVX2-NEXT:    vmovaps 928(%rdi), %xmm2
11664; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11665; AVX2-NEXT:    vmovaps 896(%rdi), %xmm3
11666; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11667; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11668; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11669; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11670; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11671; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11672; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11673; AVX2-NEXT:    vmovaps 1376(%rdi), %xmm0
11674; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11675; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
11676; AVX2-NEXT:    vmovaps 1344(%rdi), %xmm1
11677; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11678; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11679; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11680; AVX2-NEXT:    vmovaps 1312(%rdi), %xmm1
11681; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11682; AVX2-NEXT:    vmovaps 1280(%rdi), %xmm2
11683; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11684; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11685; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11686; AVX2-NEXT:    vmovaps 1504(%rdi), %xmm1
11687; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11688; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11689; AVX2-NEXT:    vmovaps 1472(%rdi), %xmm2
11690; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11691; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11692; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11693; AVX2-NEXT:    vmovaps 1440(%rdi), %xmm2
11694; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11695; AVX2-NEXT:    vmovaps 1408(%rdi), %xmm3
11696; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11697; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11698; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11699; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11700; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11701; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11702; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11703; AVX2-NEXT:    vmovaps 1888(%rdi), %xmm0
11704; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11705; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
11706; AVX2-NEXT:    vmovaps 1856(%rdi), %xmm1
11707; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11708; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11709; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11710; AVX2-NEXT:    vmovaps 1824(%rdi), %xmm1
11711; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11712; AVX2-NEXT:    vmovaps 1792(%rdi), %xmm2
11713; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11714; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11715; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
11716; AVX2-NEXT:    vmovaps 2016(%rdi), %xmm0
11717; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11718; AVX2-NEXT:    vbroadcastss %xmm0, %xmm1
11719; AVX2-NEXT:    vmovaps 1984(%rdi), %xmm0
11720; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11721; AVX2-NEXT:    vbroadcastss %xmm0, %xmm2
11722; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11723; AVX2-NEXT:    vmovaps 1952(%rdi), %xmm0
11724; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11725; AVX2-NEXT:    vmovaps 1920(%rdi), %xmm2
11726; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11727; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
11728; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11729; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11730; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11731; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
11732; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11733; AVX2-NEXT:    vmovaps 608(%rdi), %xmm0
11734; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11735; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
11736; AVX2-NEXT:    vmovaps 576(%rdi), %xmm1
11737; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11738; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11739; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11740; AVX2-NEXT:    vmovaps 544(%rdi), %xmm2
11741; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11742; AVX2-NEXT:    vmovaps 512(%rdi), %xmm1
11743; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11744; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11745; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11746; AVX2-NEXT:    vmovaps 736(%rdi), %xmm1
11747; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11748; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11749; AVX2-NEXT:    vmovaps 704(%rdi), %xmm2
11750; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11751; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11752; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11753; AVX2-NEXT:    vmovaps 672(%rdi), %xmm3
11754; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11755; AVX2-NEXT:    vmovaps 640(%rdi), %xmm2
11756; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11757; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
11758; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11759; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11760; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11761; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11762; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11763; AVX2-NEXT:    vmovaps 1120(%rdi), %xmm0
11764; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11765; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
11766; AVX2-NEXT:    vmovaps 1088(%rdi), %xmm1
11767; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11768; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11769; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11770; AVX2-NEXT:    vmovaps 1056(%rdi), %xmm2
11771; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11772; AVX2-NEXT:    vmovaps 1024(%rdi), %xmm1
11773; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11774; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11775; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11776; AVX2-NEXT:    vmovaps 1248(%rdi), %xmm1
11777; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11778; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11779; AVX2-NEXT:    vmovaps 1216(%rdi), %xmm2
11780; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11781; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11782; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11783; AVX2-NEXT:    vmovaps 1184(%rdi), %xmm3
11784; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11785; AVX2-NEXT:    vmovaps 1152(%rdi), %xmm2
11786; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11787; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
11788; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11789; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11790; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11791; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11792; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11793; AVX2-NEXT:    vmovaps 1632(%rdi), %xmm0
11794; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11795; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
11796; AVX2-NEXT:    vmovaps 1600(%rdi), %xmm1
11797; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11798; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11799; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11800; AVX2-NEXT:    vmovaps 1568(%rdi), %xmm2
11801; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11802; AVX2-NEXT:    vmovaps 1536(%rdi), %xmm1
11803; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11804; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11805; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11806; AVX2-NEXT:    vmovaps 1760(%rdi), %xmm1
11807; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11808; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
11809; AVX2-NEXT:    vmovaps 1728(%rdi), %xmm2
11810; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11811; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
11812; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11813; AVX2-NEXT:    vmovaps 1696(%rdi), %xmm3
11814; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11815; AVX2-NEXT:    vmovaps 1664(%rdi), %xmm2
11816; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11817; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
11818; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11819; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11820; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11821; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11822; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11823; AVX2-NEXT:    vmovaps 224(%rdi), %xmm0
11824; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11825; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
11826; AVX2-NEXT:    vmovaps 192(%rdi), %xmm11
11827; AVX2-NEXT:    vbroadcastss %xmm11, %xmm1
11828; AVX2-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11829; AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11830; AVX2-NEXT:    vmovaps 160(%rdi), %xmm2
11831; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11832; AVX2-NEXT:    vmovaps 128(%rdi), %xmm1
11833; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11834; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11835; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11836; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
11837; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
11838; AVX2-NEXT:    vmovaps 96(%rdi), %xmm8
11839; AVX2-NEXT:    vbroadcastss %xmm8, %xmm1
11840; AVX2-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11841; AVX2-NEXT:    vmovaps 64(%rdi), %xmm7
11842; AVX2-NEXT:    vbroadcastss %xmm7, %xmm2
11843; AVX2-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11844; AVX2-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11845; AVX2-NEXT:    vmovaps (%rdi), %xmm5
11846; AVX2-NEXT:    vmovaps 32(%rdi), %xmm6
11847; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
11848; AVX2-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11849; AVX2-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11850; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
11851; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
11852; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11853; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11854; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11855; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
11856; AVX2-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11857; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
11858; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
11859; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11860; AVX2-NEXT:    vmovaps %xmm13, %xmm9
11861; AVX2-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11862; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11863; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
11864; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11865; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11866; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11867; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11868; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11869; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11870; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11871; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11872; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11873; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11874; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11875; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11876; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11877; AVX2-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11878; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11879; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
11880; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11881; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11882; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11883; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
11884; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11885; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11886; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11887; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11888; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11889; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11890; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11891; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11892; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11893; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11894; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11895; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11896; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11897; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11898; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11899; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11900; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11901; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11902; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11903; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
11904; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11905; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11906; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11907; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11908; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11909; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11910; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11911; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11912; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11913; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11914; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11915; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11916; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11917; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11918; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11919; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11920; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11921; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11922; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11923; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11924; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11925; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11926; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11927; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11928; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11929; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11930; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11931; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11932; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11933; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
11934; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
11935; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
11936; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11937; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
11938; AVX2-NEXT:    # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
11939; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11940; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11941; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11942; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11943; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11944; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11945; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11946; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11947; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11948; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11949; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11950; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11951; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11952; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11953; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11954; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11955; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11956; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11957; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11958; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
11959; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11960; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11961; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11962; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11963; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11964; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11965; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11966; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11967; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11968; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11969; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11970; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11971; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11972; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11973; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11974; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11975; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11976; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11977; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11978; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
11979; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11980; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11981; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
11982; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11983; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11984; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11985; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11986; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11987; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11988; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11989; AVX2-NEXT:    # xmm0 = mem[1,1,1,1]
11990; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11991; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11992; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11993; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11994; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11995; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11996; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11997; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11998; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
11999; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12000; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12001; AVX2-NEXT:    # xmm2 = mem[1,1,1,1]
12002; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12003; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12004; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
12005; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12006; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12007; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12008; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12009; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
12010; AVX2-NEXT:    # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
12011; AVX2-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12012; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12013; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12014; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12015; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12016; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2]
12017; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
12018; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12019; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12020; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12021; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12022; AVX2-NEXT:    # xmm1 = mem[2,2,2,2]
12023; AVX2-NEXT:    vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
12024; AVX2-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
12025; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
12026; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12027; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12028; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12029; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12030; AVX2-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
12031; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12032; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12033; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12034; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12035; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12036; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
12037; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
12038; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12039; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12040; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12041; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12042; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2]
12043; AVX2-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12044; AVX2-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
12045; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12046; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12047; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12048; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12049; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
12050; AVX2-NEXT:    # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3]
12051; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12052; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12053; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12054; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12055; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
12056; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
12057; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12058; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12059; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12060; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12061; AVX2-NEXT:    # xmm1 = mem[2,2,2,2]
12062; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12063; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
12064; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
12065; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12066; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12067; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12068; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
12069; AVX2-NEXT:    # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
12070; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12071; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12072; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12073; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12074; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12075; AVX2-NEXT:    # xmm0 = mem[2,2,2,2]
12076; AVX2-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12077; AVX2-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
12078; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12079; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12080; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12081; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12082; AVX2-NEXT:    # xmm1 = mem[2,2,2,2]
12083; AVX2-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12084; AVX2-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
12085; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
12086; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12087; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12088; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12089; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12090; AVX2-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
12091; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12092; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12093; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12094; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12095; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12096; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2]
12097; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
12098; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12099; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12100; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12101; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12102; AVX2-NEXT:    # xmm1 = mem[2,2,2,2]
12103; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12104; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
12105; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12106; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12107; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12108; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12109; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12110; AVX2-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
12111; AVX2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12112; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12113; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12114; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12115; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12116; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
12117; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
12118; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12119; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12120; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12121; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12122; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
12123; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12124; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
12125; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12126; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12127; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12128; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12129; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
12130; AVX2-NEXT:    # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
12131; AVX2-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12132; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12133; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12134; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12135; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12136; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12137; AVX2-NEXT:    # xmm0 = mem[2,2,2,2]
12138; AVX2-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12139; AVX2-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
12140; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12141; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm15
12142; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
12143; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
12144; AVX2-NEXT:    vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
12145; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12146; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
12147; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3]
12148; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
12149; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12150; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12151; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
12152; AVX2-NEXT:    # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
12153; AVX2-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12154; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
12155; AVX2-NEXT:    # xmm15 = mem[2,2,2,2]
12156; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12157; AVX2-NEXT:    vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3]
12158; AVX2-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
12159; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm14
12160; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
12161; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12162; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
12163; AVX2-NEXT:    # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3]
12164; AVX2-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
12165; AVX2-NEXT:    # xmm13 = mem[2,2,2,2]
12166; AVX2-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
12167; AVX2-NEXT:    # xmm13 = mem[0,1,2],xmm13[3]
12168; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
12169; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7]
12170; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12171; AVX2-NEXT:    vmovaps (%rsp), %xmm5 # 16-byte Reload
12172; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
12173; AVX2-NEXT:    # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
12174; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12175; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1]
12176; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12177; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
12178; AVX2-NEXT:    # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
12179; AVX2-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
12180; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
12181; AVX2-NEXT:    # xmm12 = mem[2,3,2,3]
12182; AVX2-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
12183; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
12184; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
12185; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12186; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12187; AVX2-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
12188; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12189; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1]
12190; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12191; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload
12192; AVX2-NEXT:    # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3]
12193; AVX2-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
12194; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
12195; AVX2-NEXT:    # xmm13 = mem[2,3,2,3]
12196; AVX2-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
12197; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
12198; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
12199; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12200; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload
12201; AVX2-NEXT:    # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3]
12202; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1]
12203; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12204; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload
12205; AVX2-NEXT:    # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3]
12206; AVX2-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
12207; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
12208; AVX2-NEXT:    # xmm10 = mem[2,3,2,3]
12209; AVX2-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
12210; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
12211; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
12212; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12213; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12214; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
12215; AVX2-NEXT:    # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
12216; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1]
12217; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12218; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
12219; AVX2-NEXT:    # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
12220; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
12221; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
12222; AVX2-NEXT:    # xmm8 = mem[2,3,2,3]
12223; AVX2-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
12224; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
12225; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
12226; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12227; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
12228; AVX2-NEXT:    # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
12229; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12230; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
12231; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12232; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
12233; AVX2-NEXT:    # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
12234; AVX2-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
12235; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
12236; AVX2-NEXT:    # xmm6 = mem[2,3,2,3]
12237; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
12238; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
12239; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
12240; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12241; AVX2-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
12242; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12243; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
12244; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12245; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
12246; AVX2-NEXT:    # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3]
12247; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
12248; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
12249; AVX2-NEXT:    # xmm4 = mem[2,3,2,3]
12250; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
12251; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
12252; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12253; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12254; AVX2-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12255; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12256; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
12257; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
12258; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
12259; AVX2-NEXT:    # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
12260; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
12261; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12262; AVX2-NEXT:    # xmm2 = mem[2,3,2,3]
12263; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
12264; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
12265; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12266; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12267; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12268; AVX2-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12269; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12270; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12271; AVX2-NEXT:    # xmm0 = mem[2,3,2,3]
12272; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12273; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
12274; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12275; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12276; AVX2-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
12277; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
12278; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12279; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12280; AVX2-NEXT:    vmovaps 32(%rdi), %ymm0
12281; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12282; AVX2-NEXT:    vmovaps (%rdi), %ymm1
12283; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12284; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12285; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
12286; AVX2-NEXT:    vmovaps 96(%rdi), %ymm1
12287; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12288; AVX2-NEXT:    vmovaps 64(%rdi), %ymm2
12289; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12290; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12291; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12292; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12293; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12294; AVX2-NEXT:    vmovaps 224(%rdi), %ymm2
12295; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12296; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
12297; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12298; AVX2-NEXT:    vmovaps 160(%rdi), %ymm15
12299; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
12300; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12301; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
12302; AVX2-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12303; AVX2-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
12304; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12305; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
12306; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12307; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12308; AVX2-NEXT:    vmovaps 288(%rdi), %ymm0
12309; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12310; AVX2-NEXT:    vmovaps 256(%rdi), %ymm1
12311; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12312; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12313; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
12314; AVX2-NEXT:    vmovaps 352(%rdi), %ymm1
12315; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12316; AVX2-NEXT:    vmovaps 320(%rdi), %ymm2
12317; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12318; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12319; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12320; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12321; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12322; AVX2-NEXT:    vmovaps 480(%rdi), %ymm2
12323; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12324; AVX2-NEXT:    vmovaps 448(%rdi), %ymm3
12325; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12326; AVX2-NEXT:    vmovaps 416(%rdi), %ymm4
12327; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12328; AVX2-NEXT:    vmovaps 384(%rdi), %ymm1
12329; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12330; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
12331; AVX2-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
12332; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
12333; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12334; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12335; AVX2-NEXT:    vmovaps 544(%rdi), %ymm0
12336; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12337; AVX2-NEXT:    vmovaps 512(%rdi), %ymm1
12338; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12339; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12340; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
12341; AVX2-NEXT:    vmovaps 608(%rdi), %ymm1
12342; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12343; AVX2-NEXT:    vmovaps 576(%rdi), %ymm2
12344; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12345; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12346; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12347; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12348; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12349; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
12350; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12351; AVX2-NEXT:    vmovaps 704(%rdi), %ymm3
12352; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12353; AVX2-NEXT:    vmovaps 672(%rdi), %ymm4
12354; AVX2-NEXT:    vmovaps 640(%rdi), %ymm1
12355; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12356; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
12357; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12358; AVX2-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
12359; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
12360; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12361; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12362; AVX2-NEXT:    vmovaps 800(%rdi), %ymm0
12363; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12364; AVX2-NEXT:    vmovaps 768(%rdi), %ymm1
12365; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12366; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12367; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
12368; AVX2-NEXT:    vmovaps 864(%rdi), %ymm1
12369; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12370; AVX2-NEXT:    vmovaps 832(%rdi), %ymm2
12371; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12372; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12373; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12374; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12375; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12376; AVX2-NEXT:    vmovaps 992(%rdi), %ymm2
12377; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12378; AVX2-NEXT:    vmovaps 960(%rdi), %ymm5
12379; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12380; AVX2-NEXT:    vmovaps 928(%rdi), %ymm3
12381; AVX2-NEXT:    vmovaps 896(%rdi), %ymm1
12382; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12383; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
12384; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12385; AVX2-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
12386; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
12387; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12388; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12389; AVX2-NEXT:    vmovaps 1056(%rdi), %ymm0
12390; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12391; AVX2-NEXT:    vmovaps 1024(%rdi), %ymm1
12392; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12393; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12394; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
12395; AVX2-NEXT:    vmovaps 1120(%rdi), %ymm1
12396; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12397; AVX2-NEXT:    vmovaps 1088(%rdi), %ymm2
12398; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12399; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12400; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12401; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12402; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12403; AVX2-NEXT:    vmovaps 1248(%rdi), %ymm1
12404; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12405; AVX2-NEXT:    vmovaps 1216(%rdi), %ymm7
12406; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12407; AVX2-NEXT:    vmovaps 1184(%rdi), %ymm2
12408; AVX2-NEXT:    vmovaps 1152(%rdi), %ymm6
12409; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12410; AVX2-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
12411; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12412; AVX2-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
12413; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
12414; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
12415; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12416; AVX2-NEXT:    vmovaps 1312(%rdi), %ymm0
12417; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12418; AVX2-NEXT:    vmovaps 1280(%rdi), %ymm6
12419; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12420; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
12421; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm6
12422; AVX2-NEXT:    vmovaps 1376(%rdi), %ymm0
12423; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12424; AVX2-NEXT:    vmovaps 1344(%rdi), %ymm7
12425; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12426; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
12427; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12428; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2]
12429; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
12430; AVX2-NEXT:    vmovaps 1504(%rdi), %ymm6
12431; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12432; AVX2-NEXT:    vmovaps 1472(%rdi), %ymm10
12433; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12434; AVX2-NEXT:    vmovaps 1440(%rdi), %ymm0
12435; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
12436; AVX2-NEXT:    vmovaps 1408(%rdi), %ymm9
12437; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12438; AVX2-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
12439; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
12440; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
12441; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
12442; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12443; AVX2-NEXT:    vmovaps 1568(%rdi), %ymm6
12444; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12445; AVX2-NEXT:    vmovaps 1536(%rdi), %ymm7
12446; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12447; AVX2-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12448; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm9
12449; AVX2-NEXT:    vmovaps 1632(%rdi), %ymm6
12450; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12451; AVX2-NEXT:    vmovaps 1600(%rdi), %ymm7
12452; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12453; AVX2-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12454; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12455; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2]
12456; AVX2-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3]
12457; AVX2-NEXT:    vmovaps 1760(%rdi), %ymm9
12458; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12459; AVX2-NEXT:    vmovaps 1728(%rdi), %ymm6
12460; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12461; AVX2-NEXT:    vmovaps 1696(%rdi), %ymm7
12462; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12463; AVX2-NEXT:    vmovaps 1664(%rdi), %ymm11
12464; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12465; AVX2-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
12466; AVX2-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
12467; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12468; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2]
12469; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
12470; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12471; AVX2-NEXT:    vmovaps 1824(%rdi), %ymm6
12472; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12473; AVX2-NEXT:    vmovaps 1792(%rdi), %ymm7
12474; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12475; AVX2-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12476; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm11
12477; AVX2-NEXT:    vmovaps 1888(%rdi), %ymm6
12478; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12479; AVX2-NEXT:    vmovaps 1856(%rdi), %ymm7
12480; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12481; AVX2-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12482; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12483; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2]
12484; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3]
12485; AVX2-NEXT:    vmovaps 2016(%rdi), %ymm11
12486; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12487; AVX2-NEXT:    vmovaps 1984(%rdi), %ymm6
12488; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12489; AVX2-NEXT:    vmovaps 1952(%rdi), %ymm7
12490; AVX2-NEXT:    vmovaps 1920(%rdi), %ymm9
12491; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12492; AVX2-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
12493; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12494; AVX2-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
12495; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
12496; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
12497; AVX2-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12498; AVX2-NEXT:    vbroadcastss 148(%rdi), %ymm13
12499; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
12500; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
12501; AVX2-NEXT:    # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
12502; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12503; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm14
12504; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12505; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
12506; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12507; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
12508; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
12509; AVX2-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
12510; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
12511; AVX2-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12512; AVX2-NEXT:    vbroadcastss 404(%rdi), %ymm13
12513; AVX2-NEXT:    vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
12514; AVX2-NEXT:    # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
12515; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
12516; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12517; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm13
12518; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12519; AVX2-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
12520; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12521; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
12522; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
12523; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
12524; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
12525; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12526; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm12
12527; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
12528; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
12529; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12530; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
12531; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12532; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
12533; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12534; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
12535; AVX2-NEXT:    vextractf128 $1, %ymm12, %xmm12
12536; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
12537; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
12538; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12539; AVX2-NEXT:    vbroadcastss 916(%rdi), %ymm4
12540; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
12541; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
12542; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12543; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
12544; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12545; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
12546; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12547; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7]
12548; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
12549; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12550; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
12551; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12552; AVX2-NEXT:    vbroadcastss 1172(%rdi), %ymm3
12553; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12554; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
12555; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12556; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
12557; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12558; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
12559; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12560; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
12561; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
12562; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12563; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12564; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12565; AVX2-NEXT:    vbroadcastss 1428(%rdi), %ymm1
12566; AVX2-NEXT:    vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
12567; AVX2-NEXT:    # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
12568; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12569; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12570; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
12571; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12572; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
12573; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12574; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12575; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
12576; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
12577; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12578; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12579; AVX2-NEXT:    vbroadcastss 1684(%rdi), %ymm0
12580; AVX2-NEXT:    vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12581; AVX2-NEXT:    # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
12582; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12583; AVX2-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
12584; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12585; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
12586; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12587; AVX2-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
12588; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12589; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12590; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
12591; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12592; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12593; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12594; AVX2-NEXT:    vbroadcastss 1940(%rdi), %ymm0
12595; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
12596; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
12597; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12598; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
12599; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12600; AVX2-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
12601; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12602; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12603; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
12604; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12605; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12606; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12607; AVX2-NEXT:    vbroadcastss 248(%rdi), %ymm0
12608; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12609; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12610; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12611; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12612; AVX2-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12613; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12614; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
12615; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12616; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12617; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12618; AVX2-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12619; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12620; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
12621; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2]
12622; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12623; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
12624; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12625; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12626; AVX2-NEXT:    vbroadcastss 504(%rdi), %ymm0
12627; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12628; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12629; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12630; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12631; AVX2-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12632; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12633; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7]
12634; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12635; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12636; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12637; AVX2-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12638; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12639; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
12640; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
12641; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12642; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12643; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12644; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12645; AVX2-NEXT:    vbroadcastss 760(%rdi), %ymm0
12646; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12647; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12648; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12649; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12650; AVX2-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12651; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12652; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
12653; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12654; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12655; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12656; AVX2-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12657; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12658; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
12659; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
12660; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12661; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12662; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12663; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12664; AVX2-NEXT:    vbroadcastss 1016(%rdi), %ymm0
12665; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12666; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12667; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12668; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12669; AVX2-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12670; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12671; AVX2-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
12672; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12673; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12674; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
12675; AVX2-NEXT:    # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12676; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
12677; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2]
12678; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12679; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12680; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12681; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12682; AVX2-NEXT:    vbroadcastss 1272(%rdi), %ymm0
12683; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12684; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12685; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12686; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
12687; AVX2-NEXT:    # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12688; AVX2-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
12689; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12690; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
12691; AVX2-NEXT:    # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12692; AVX2-NEXT:    vextractf128 $1, %ymm13, %xmm1
12693; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2]
12694; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12695; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
12696; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12697; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12698; AVX2-NEXT:    vbroadcastss 1528(%rdi), %ymm0
12699; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12700; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12701; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12702; AVX2-NEXT:    vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
12703; AVX2-NEXT:    # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12704; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
12705; AVX2-NEXT:    # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
12706; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12707; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
12708; AVX2-NEXT:    # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12709; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm1
12710; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2]
12711; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12712; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
12713; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12714; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12715; AVX2-NEXT:    vbroadcastss 1784(%rdi), %ymm0
12716; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12717; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12718; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12719; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12720; AVX2-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12721; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12722; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12723; AVX2-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12724; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12725; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
12726; AVX2-NEXT:    # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12727; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm1
12728; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2]
12729; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12730; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12731; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12732; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12733; AVX2-NEXT:    vbroadcastss 2040(%rdi), %ymm0
12734; AVX2-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
12735; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
12736; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12737; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12738; AVX2-NEXT:    # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12739; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12740; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
12741; AVX2-NEXT:    # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12742; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12743; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12744; AVX2-NEXT:    # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12745; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm0
12746; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2]
12747; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
12748; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
12749; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12750; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
12751; AVX2-NEXT:    vbroadcastss 220(%rdi), %ymm0
12752; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12753; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12754; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12755; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12756; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12757; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
12758; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12759; AVX2-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
12760; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
12761; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
12762; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12763; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12764; AVX2-NEXT:    vbroadcastss 476(%rdi), %ymm0
12765; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12766; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12767; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12768; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12769; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12770; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
12771; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12772; AVX2-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
12773; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
12774; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
12775; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12776; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12777; AVX2-NEXT:    vbroadcastss 732(%rdi), %ymm0
12778; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12779; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12780; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12781; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12782; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12783; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
12784; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12785; AVX2-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
12786; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
12787; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
12788; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12789; AVX2-NEXT:    vbroadcastss 988(%rdi), %ymm0
12790; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12791; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12792; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12793; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12794; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm1
12795; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12796; AVX2-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
12797; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
12798; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
12799; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12800; AVX2-NEXT:    vbroadcastss 1244(%rdi), %ymm0
12801; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12802; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12803; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
12804; AVX2-NEXT:    vextractf128 $1, %ymm11, %xmm1
12805; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7]
12806; AVX2-NEXT:    vextractf128 $1, %ymm11, %xmm11
12807; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
12808; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12809; AVX2-NEXT:    vbroadcastss 1500(%rdi), %ymm0
12810; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12811; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12812; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
12813; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
12814; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
12815; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm9
12816; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
12817; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
12818; AVX2-NEXT:    vbroadcastss 1756(%rdi), %ymm0
12819; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12820; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12821; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
12822; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
12823; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7]
12824; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
12825; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
12826; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12827; AVX2-NEXT:    vbroadcastss 2012(%rdi), %ymm0
12828; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12829; AVX2-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12830; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
12831; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
12832; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
12833; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
12834; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12835; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12836; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12837; AVX2-NEXT:    vmovaps %ymm2, 192(%rsi)
12838; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12839; AVX2-NEXT:    vmovaps %ymm2, 128(%rsi)
12840; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12841; AVX2-NEXT:    vmovaps %ymm2, 64(%rsi)
12842; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12843; AVX2-NEXT:    vmovaps %ymm2, (%rsi)
12844; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12845; AVX2-NEXT:    vmovaps %ymm2, 224(%rsi)
12846; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12847; AVX2-NEXT:    vmovaps %ymm2, 160(%rsi)
12848; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12849; AVX2-NEXT:    vmovaps %ymm2, 96(%rsi)
12850; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12851; AVX2-NEXT:    vmovaps %ymm2, 32(%rsi)
12852; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12853; AVX2-NEXT:    vmovaps %ymm2, 192(%rdx)
12854; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12855; AVX2-NEXT:    vmovaps %ymm2, 128(%rdx)
12856; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12857; AVX2-NEXT:    vmovaps %ymm2, 64(%rdx)
12858; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12859; AVX2-NEXT:    vmovaps %ymm2, (%rdx)
12860; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12861; AVX2-NEXT:    vmovaps %ymm2, 224(%rdx)
12862; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12863; AVX2-NEXT:    vmovaps %ymm2, 160(%rdx)
12864; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12865; AVX2-NEXT:    vmovaps %ymm2, 96(%rdx)
12866; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12867; AVX2-NEXT:    vmovaps %ymm2, 32(%rdx)
12868; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12869; AVX2-NEXT:    vmovaps %ymm2, 192(%rcx)
12870; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12871; AVX2-NEXT:    vmovaps %ymm2, 128(%rcx)
12872; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12873; AVX2-NEXT:    vmovaps %ymm2, 64(%rcx)
12874; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12875; AVX2-NEXT:    vmovaps %ymm2, (%rcx)
12876; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12877; AVX2-NEXT:    vmovaps %ymm2, 224(%rcx)
12878; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12879; AVX2-NEXT:    vmovaps %ymm2, 160(%rcx)
12880; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12881; AVX2-NEXT:    vmovaps %ymm2, 96(%rcx)
12882; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12883; AVX2-NEXT:    vmovaps %ymm2, 32(%rcx)
12884; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12885; AVX2-NEXT:    vmovaps %ymm2, 192(%r8)
12886; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12887; AVX2-NEXT:    vmovaps %ymm2, 128(%r8)
12888; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12889; AVX2-NEXT:    vmovaps %ymm2, 64(%r8)
12890; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12891; AVX2-NEXT:    vmovaps %ymm2, (%r8)
12892; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12893; AVX2-NEXT:    vmovaps %ymm2, 224(%r8)
12894; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12895; AVX2-NEXT:    vmovaps %ymm2, 160(%r8)
12896; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12897; AVX2-NEXT:    vmovaps %ymm2, 96(%r8)
12898; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12899; AVX2-NEXT:    vmovaps %ymm2, 32(%r8)
12900; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12901; AVX2-NEXT:    vmovaps %ymm2, 224(%r9)
12902; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12903; AVX2-NEXT:    vmovaps %ymm2, 192(%r9)
12904; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12905; AVX2-NEXT:    vmovaps %ymm2, 160(%r9)
12906; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12907; AVX2-NEXT:    vmovaps %ymm2, 128(%r9)
12908; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12909; AVX2-NEXT:    vmovaps %ymm2, 96(%r9)
12910; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12911; AVX2-NEXT:    vmovaps %ymm2, 64(%r9)
12912; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12913; AVX2-NEXT:    vmovaps %ymm2, 32(%r9)
12914; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12915; AVX2-NEXT:    vmovaps %ymm2, (%r9)
12916; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12917; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12918; AVX2-NEXT:    vmovaps %ymm2, 224(%rax)
12919; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12920; AVX2-NEXT:    vmovaps %ymm2, 192(%rax)
12921; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12922; AVX2-NEXT:    vmovaps %ymm2, 160(%rax)
12923; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12924; AVX2-NEXT:    vmovaps %ymm2, 128(%rax)
12925; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12926; AVX2-NEXT:    vmovaps %ymm2, 96(%rax)
12927; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12928; AVX2-NEXT:    vmovaps %ymm2, 64(%rax)
12929; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12930; AVX2-NEXT:    vmovaps %ymm2, 32(%rax)
12931; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12932; AVX2-NEXT:    vmovaps %ymm2, (%rax)
12933; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12934; AVX2-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
12935; AVX2-NEXT:    vmovaps %ymm2, 224(%rax)
12936; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12937; AVX2-NEXT:    vmovaps %ymm2, 192(%rax)
12938; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12939; AVX2-NEXT:    vmovaps %ymm2, 160(%rax)
12940; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12941; AVX2-NEXT:    vmovaps %ymm2, 128(%rax)
12942; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12943; AVX2-NEXT:    vmovaps %ymm2, 96(%rax)
12944; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12945; AVX2-NEXT:    vmovaps %ymm2, 64(%rax)
12946; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12947; AVX2-NEXT:    vmovaps %ymm2, 32(%rax)
12948; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12949; AVX2-NEXT:    vmovaps %ymm2, (%rax)
12950; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12951; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
12952; AVX2-NEXT:    vmovaps %ymm5, 192(%rax)
12953; AVX2-NEXT:    vmovaps %ymm8, 160(%rax)
12954; AVX2-NEXT:    vmovaps %ymm1, 128(%rax)
12955; AVX2-NEXT:    vmovaps %ymm14, 96(%rax)
12956; AVX2-NEXT:    vmovaps %ymm15, 64(%rax)
12957; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12958; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
12959; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12960; AVX2-NEXT:    vmovaps %ymm0, (%rax)
12961; AVX2-NEXT:    addq $3528, %rsp # imm = 0xDC8
12962; AVX2-NEXT:    vzeroupper
12963; AVX2-NEXT:    retq
12964;
12965; AVX2-FP-LABEL: load_i32_stride8_vf64:
12966; AVX2-FP:       # %bb.0:
12967; AVX2-FP-NEXT:    subq $3528, %rsp # imm = 0xDC8
12968; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm10
12969; AVX2-FP-NEXT:    vmovaps 256(%rdi), %xmm0
12970; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12971; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
12972; AVX2-FP-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12973; AVX2-FP-NEXT:    vmovaps 352(%rdi), %xmm9
12974; AVX2-FP-NEXT:    vbroadcastss %xmm9, %xmm1
12975; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm2
12976; AVX2-FP-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
12977; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
12978; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
12979; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12980; AVX2-FP-NEXT:    vmovaps 416(%rdi), %xmm1
12981; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12982; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm2
12983; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12984; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
12985; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
12986; AVX2-FP-NEXT:    vmovaps 480(%rdi), %xmm13
12987; AVX2-FP-NEXT:    vbroadcastss %xmm13, %xmm2
12988; AVX2-FP-NEXT:    vmovaps 448(%rdi), %xmm3
12989; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12990; AVX2-FP-NEXT:    vbroadcastss %xmm3, %xmm3
12991; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
12992; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
12993; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
12994; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12995; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12996; AVX2-FP-NEXT:    vmovaps 800(%rdi), %xmm0
12997; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12998; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm1
12999; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13000; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13001; AVX2-FP-NEXT:    vmovaps 864(%rdi), %xmm12
13002; AVX2-FP-NEXT:    vbroadcastss %xmm12, %xmm1
13003; AVX2-FP-NEXT:    vmovaps 832(%rdi), %xmm2
13004; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13005; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
13006; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13007; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13008; AVX2-FP-NEXT:    vmovaps 992(%rdi), %xmm1
13009; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13010; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13011; AVX2-FP-NEXT:    vmovaps 960(%rdi), %xmm2
13012; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13013; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
13014; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13015; AVX2-FP-NEXT:    vmovaps 928(%rdi), %xmm2
13016; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13017; AVX2-FP-NEXT:    vmovaps 896(%rdi), %xmm3
13018; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13019; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13020; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13021; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13022; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13023; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13024; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13025; AVX2-FP-NEXT:    vmovaps 1376(%rdi), %xmm0
13026; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13027; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
13028; AVX2-FP-NEXT:    vmovaps 1344(%rdi), %xmm1
13029; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13030; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13031; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13032; AVX2-FP-NEXT:    vmovaps 1312(%rdi), %xmm1
13033; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13034; AVX2-FP-NEXT:    vmovaps 1280(%rdi), %xmm2
13035; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13036; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13037; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13038; AVX2-FP-NEXT:    vmovaps 1504(%rdi), %xmm1
13039; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13040; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13041; AVX2-FP-NEXT:    vmovaps 1472(%rdi), %xmm2
13042; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13043; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
13044; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13045; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %xmm2
13046; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13047; AVX2-FP-NEXT:    vmovaps 1408(%rdi), %xmm3
13048; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13049; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13050; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13051; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13052; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13053; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13054; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13055; AVX2-FP-NEXT:    vmovaps 1888(%rdi), %xmm0
13056; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13057; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
13058; AVX2-FP-NEXT:    vmovaps 1856(%rdi), %xmm1
13059; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13060; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13061; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13062; AVX2-FP-NEXT:    vmovaps 1824(%rdi), %xmm1
13063; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13064; AVX2-FP-NEXT:    vmovaps 1792(%rdi), %xmm2
13065; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13066; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13067; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
13068; AVX2-FP-NEXT:    vmovaps 2016(%rdi), %xmm0
13069; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13070; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm1
13071; AVX2-FP-NEXT:    vmovaps 1984(%rdi), %xmm0
13072; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13073; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm2
13074; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13075; AVX2-FP-NEXT:    vmovaps 1952(%rdi), %xmm0
13076; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13077; AVX2-FP-NEXT:    vmovaps 1920(%rdi), %xmm2
13078; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13079; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
13080; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13081; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13082; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13083; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13084; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13085; AVX2-FP-NEXT:    vmovaps 608(%rdi), %xmm0
13086; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13087; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
13088; AVX2-FP-NEXT:    vmovaps 576(%rdi), %xmm1
13089; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13090; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13091; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13092; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm2
13093; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13094; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm1
13095; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13096; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13097; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13098; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm1
13099; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13100; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13101; AVX2-FP-NEXT:    vmovaps 704(%rdi), %xmm2
13102; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13103; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
13104; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13105; AVX2-FP-NEXT:    vmovaps 672(%rdi), %xmm3
13106; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13107; AVX2-FP-NEXT:    vmovaps 640(%rdi), %xmm2
13108; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13109; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13110; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13111; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13112; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13113; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13114; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13115; AVX2-FP-NEXT:    vmovaps 1120(%rdi), %xmm0
13116; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13117; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
13118; AVX2-FP-NEXT:    vmovaps 1088(%rdi), %xmm1
13119; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13120; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13121; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13122; AVX2-FP-NEXT:    vmovaps 1056(%rdi), %xmm2
13123; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13124; AVX2-FP-NEXT:    vmovaps 1024(%rdi), %xmm1
13125; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13126; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13127; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13128; AVX2-FP-NEXT:    vmovaps 1248(%rdi), %xmm1
13129; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13130; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13131; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %xmm2
13132; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13133; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
13134; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13135; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %xmm3
13136; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13137; AVX2-FP-NEXT:    vmovaps 1152(%rdi), %xmm2
13138; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13139; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13140; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13141; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13142; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13143; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13144; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13145; AVX2-FP-NEXT:    vmovaps 1632(%rdi), %xmm0
13146; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13147; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
13148; AVX2-FP-NEXT:    vmovaps 1600(%rdi), %xmm1
13149; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13150; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13151; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13152; AVX2-FP-NEXT:    vmovaps 1568(%rdi), %xmm2
13153; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13154; AVX2-FP-NEXT:    vmovaps 1536(%rdi), %xmm1
13155; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13156; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13157; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13158; AVX2-FP-NEXT:    vmovaps 1760(%rdi), %xmm1
13159; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13160; AVX2-FP-NEXT:    vbroadcastss %xmm1, %xmm1
13161; AVX2-FP-NEXT:    vmovaps 1728(%rdi), %xmm2
13162; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13163; AVX2-FP-NEXT:    vbroadcastss %xmm2, %xmm2
13164; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13165; AVX2-FP-NEXT:    vmovaps 1696(%rdi), %xmm3
13166; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13167; AVX2-FP-NEXT:    vmovaps 1664(%rdi), %xmm2
13168; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13169; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13170; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13171; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13172; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13173; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13174; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13175; AVX2-FP-NEXT:    vmovaps 224(%rdi), %xmm0
13176; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13177; AVX2-FP-NEXT:    vbroadcastss %xmm0, %xmm0
13178; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm11
13179; AVX2-FP-NEXT:    vbroadcastss %xmm11, %xmm1
13180; AVX2-FP-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13181; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13182; AVX2-FP-NEXT:    vmovaps 160(%rdi), %xmm2
13183; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13184; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm1
13185; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13186; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13187; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13188; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13189; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13190; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm8
13191; AVX2-FP-NEXT:    vbroadcastss %xmm8, %xmm1
13192; AVX2-FP-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13193; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm7
13194; AVX2-FP-NEXT:    vbroadcastss %xmm7, %xmm2
13195; AVX2-FP-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13196; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13197; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm5
13198; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm6
13199; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
13200; AVX2-FP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13201; AVX2-FP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13202; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
13203; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
13204; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13205; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13206; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13207; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
13208; AVX2-FP-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13209; AVX2-FP-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
13210; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
13211; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13212; AVX2-FP-NEXT:    vmovaps %xmm13, %xmm9
13213; AVX2-FP-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13214; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13215; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
13216; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13217; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13218; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13219; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13220; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13221; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13222; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13223; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13224; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13225; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13226; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13227; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13228; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13229; AVX2-FP-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13230; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13231; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
13232; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13233; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
13234; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
13235; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
13236; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13237; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13238; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13239; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13240; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13241; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13242; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13243; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13244; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13245; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13246; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13247; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13248; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13249; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13250; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13251; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13252; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13253; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13254; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13255; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
13256; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13257; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13258; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13259; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13260; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13261; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13262; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13263; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13264; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13265; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13266; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13267; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13268; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13269; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13270; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13271; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13272; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13273; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13274; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13275; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13276; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13277; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13278; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13279; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13280; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13281; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13282; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13283; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13284; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13285; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
13286; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
13287; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
13288; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13289; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
13290; AVX2-FP-NEXT:    # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
13291; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13292; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13293; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13294; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13295; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13296; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13297; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13298; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13299; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13300; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13301; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13302; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13303; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13304; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13305; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13306; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13307; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13308; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13309; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13310; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
13311; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13312; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13313; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13314; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13315; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13316; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13317; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13318; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13319; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13320; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13321; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13322; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13323; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13324; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13325; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13326; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13327; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13328; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13329; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
13330; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
13331; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13332; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13333; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13334; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13335; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13336; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13337; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13338; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13339; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13340; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13341; AVX2-FP-NEXT:    # xmm0 = mem[1,1,1,1]
13342; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13343; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13344; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13345; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13346; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13347; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13348; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13349; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13350; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13351; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13352; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13353; AVX2-FP-NEXT:    # xmm2 = mem[1,1,1,1]
13354; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13355; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13356; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13357; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13358; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13359; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13360; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13361; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
13362; AVX2-FP-NEXT:    # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
13363; AVX2-FP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13364; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13365; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13366; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13367; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13368; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2]
13369; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
13370; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13371; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13372; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13373; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13374; AVX2-FP-NEXT:    # xmm1 = mem[2,2,2,2]
13375; AVX2-FP-NEXT:    vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
13376; AVX2-FP-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
13377; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
13378; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13379; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13380; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13381; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13382; AVX2-FP-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
13383; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13384; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13385; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13386; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13387; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13388; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
13389; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
13390; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13391; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13392; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13393; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13394; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2]
13395; AVX2-FP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13396; AVX2-FP-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
13397; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13398; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13399; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13400; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13401; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
13402; AVX2-FP-NEXT:    # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3]
13403; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13404; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13405; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13406; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13407; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
13408; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
13409; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13410; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13411; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13412; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13413; AVX2-FP-NEXT:    # xmm1 = mem[2,2,2,2]
13414; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
13415; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
13416; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
13417; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13418; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13419; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13420; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
13421; AVX2-FP-NEXT:    # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
13422; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13423; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13424; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13425; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13426; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13427; AVX2-FP-NEXT:    # xmm0 = mem[2,2,2,2]
13428; AVX2-FP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13429; AVX2-FP-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
13430; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13431; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13432; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13433; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13434; AVX2-FP-NEXT:    # xmm1 = mem[2,2,2,2]
13435; AVX2-FP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13436; AVX2-FP-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
13437; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
13438; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13439; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13440; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13441; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13442; AVX2-FP-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
13443; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13444; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13445; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13446; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13447; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13448; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2]
13449; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
13450; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13451; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13452; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13453; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13454; AVX2-FP-NEXT:    # xmm1 = mem[2,2,2,2]
13455; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13456; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
13457; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13458; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13459; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13460; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13461; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13462; AVX2-FP-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
13463; AVX2-FP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13464; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13465; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13466; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13467; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13468; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
13469; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
13470; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13471; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13472; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13473; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13474; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
13475; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13476; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
13477; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13478; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13479; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13480; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13481; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13482; AVX2-FP-NEXT:    # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
13483; AVX2-FP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13484; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13485; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13486; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13487; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13488; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13489; AVX2-FP-NEXT:    # xmm0 = mem[2,2,2,2]
13490; AVX2-FP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13491; AVX2-FP-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
13492; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13493; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm15
13494; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
13495; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13496; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
13497; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13498; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
13499; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3]
13500; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
13501; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13502; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13503; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13504; AVX2-FP-NEXT:    # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
13505; AVX2-FP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13506; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
13507; AVX2-FP-NEXT:    # xmm15 = mem[2,2,2,2]
13508; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13509; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3]
13510; AVX2-FP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
13511; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm14
13512; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
13513; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13514; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
13515; AVX2-FP-NEXT:    # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3]
13516; AVX2-FP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
13517; AVX2-FP-NEXT:    # xmm13 = mem[2,2,2,2]
13518; AVX2-FP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
13519; AVX2-FP-NEXT:    # xmm13 = mem[0,1,2],xmm13[3]
13520; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
13521; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7]
13522; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13523; AVX2-FP-NEXT:    vmovaps (%rsp), %xmm5 # 16-byte Reload
13524; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
13525; AVX2-FP-NEXT:    # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
13526; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13527; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1]
13528; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13529; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
13530; AVX2-FP-NEXT:    # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
13531; AVX2-FP-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
13532; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
13533; AVX2-FP-NEXT:    # xmm12 = mem[2,3,2,3]
13534; AVX2-FP-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
13535; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
13536; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
13537; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13538; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13539; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
13540; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13541; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1]
13542; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13543; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload
13544; AVX2-FP-NEXT:    # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3]
13545; AVX2-FP-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
13546; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
13547; AVX2-FP-NEXT:    # xmm13 = mem[2,3,2,3]
13548; AVX2-FP-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
13549; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
13550; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
13551; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13552; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload
13553; AVX2-FP-NEXT:    # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3]
13554; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1]
13555; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13556; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload
13557; AVX2-FP-NEXT:    # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3]
13558; AVX2-FP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
13559; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
13560; AVX2-FP-NEXT:    # xmm10 = mem[2,3,2,3]
13561; AVX2-FP-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
13562; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
13563; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
13564; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13565; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13566; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
13567; AVX2-FP-NEXT:    # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
13568; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1]
13569; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13570; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
13571; AVX2-FP-NEXT:    # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
13572; AVX2-FP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
13573; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
13574; AVX2-FP-NEXT:    # xmm8 = mem[2,3,2,3]
13575; AVX2-FP-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
13576; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
13577; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
13578; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13579; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
13580; AVX2-FP-NEXT:    # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
13581; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13582; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
13583; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13584; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
13585; AVX2-FP-NEXT:    # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
13586; AVX2-FP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
13587; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
13588; AVX2-FP-NEXT:    # xmm6 = mem[2,3,2,3]
13589; AVX2-FP-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
13590; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
13591; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
13592; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13593; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
13594; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13595; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
13596; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13597; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
13598; AVX2-FP-NEXT:    # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3]
13599; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
13600; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
13601; AVX2-FP-NEXT:    # xmm4 = mem[2,3,2,3]
13602; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
13603; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
13604; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
13605; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13606; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13607; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13608; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
13609; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13610; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
13611; AVX2-FP-NEXT:    # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
13612; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
13613; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13614; AVX2-FP-NEXT:    # xmm2 = mem[2,3,2,3]
13615; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13616; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
13617; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13618; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13619; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13620; AVX2-FP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13621; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13622; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13623; AVX2-FP-NEXT:    # xmm0 = mem[2,3,2,3]
13624; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13625; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13626; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13627; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13628; AVX2-FP-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
13629; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
13630; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13631; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13632; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm0
13633; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13634; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
13635; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13636; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13637; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
13638; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm1
13639; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13640; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm2
13641; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13642; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13643; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13644; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13645; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13646; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm2
13647; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13648; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
13649; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13650; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm15
13651; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
13652; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13653; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
13654; AVX2-FP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13655; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
13656; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13657; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
13658; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13659; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13660; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm0
13661; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13662; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm1
13663; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13664; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13665; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
13666; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm1
13667; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13668; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm2
13669; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13670; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13671; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13672; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13673; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13674; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm2
13675; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13676; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm3
13677; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13678; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm4
13679; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13680; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm1
13681; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13682; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
13683; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
13684; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
13685; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13686; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13687; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm0
13688; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13689; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm1
13690; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13691; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13692; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
13693; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm1
13694; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13695; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm2
13696; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13697; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13698; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13699; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13700; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13701; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
13702; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13703; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm3
13704; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13705; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm4
13706; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm1
13707; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13708; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
13709; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13710; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
13711; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
13712; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13713; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13714; AVX2-FP-NEXT:    vmovaps 800(%rdi), %ymm0
13715; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13716; AVX2-FP-NEXT:    vmovaps 768(%rdi), %ymm1
13717; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13718; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13719; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
13720; AVX2-FP-NEXT:    vmovaps 864(%rdi), %ymm1
13721; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13722; AVX2-FP-NEXT:    vmovaps 832(%rdi), %ymm2
13723; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13724; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13725; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13726; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13727; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13728; AVX2-FP-NEXT:    vmovaps 992(%rdi), %ymm2
13729; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13730; AVX2-FP-NEXT:    vmovaps 960(%rdi), %ymm5
13731; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13732; AVX2-FP-NEXT:    vmovaps 928(%rdi), %ymm3
13733; AVX2-FP-NEXT:    vmovaps 896(%rdi), %ymm1
13734; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13735; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
13736; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13737; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
13738; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
13739; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13740; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13741; AVX2-FP-NEXT:    vmovaps 1056(%rdi), %ymm0
13742; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13743; AVX2-FP-NEXT:    vmovaps 1024(%rdi), %ymm1
13744; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13745; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13746; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
13747; AVX2-FP-NEXT:    vmovaps 1120(%rdi), %ymm1
13748; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13749; AVX2-FP-NEXT:    vmovaps 1088(%rdi), %ymm2
13750; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13751; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13752; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13753; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13754; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13755; AVX2-FP-NEXT:    vmovaps 1248(%rdi), %ymm1
13756; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13757; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %ymm7
13758; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13759; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %ymm2
13760; AVX2-FP-NEXT:    vmovaps 1152(%rdi), %ymm6
13761; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13762; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
13763; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13764; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
13765; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
13766; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
13767; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13768; AVX2-FP-NEXT:    vmovaps 1312(%rdi), %ymm0
13769; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13770; AVX2-FP-NEXT:    vmovaps 1280(%rdi), %ymm6
13771; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13772; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
13773; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm6
13774; AVX2-FP-NEXT:    vmovaps 1376(%rdi), %ymm0
13775; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13776; AVX2-FP-NEXT:    vmovaps 1344(%rdi), %ymm7
13777; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13778; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
13779; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13780; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2]
13781; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
13782; AVX2-FP-NEXT:    vmovaps 1504(%rdi), %ymm6
13783; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13784; AVX2-FP-NEXT:    vmovaps 1472(%rdi), %ymm10
13785; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13786; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %ymm0
13787; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
13788; AVX2-FP-NEXT:    vmovaps 1408(%rdi), %ymm9
13789; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13790; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
13791; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
13792; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
13793; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
13794; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13795; AVX2-FP-NEXT:    vmovaps 1568(%rdi), %ymm6
13796; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13797; AVX2-FP-NEXT:    vmovaps 1536(%rdi), %ymm7
13798; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13799; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13800; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm9
13801; AVX2-FP-NEXT:    vmovaps 1632(%rdi), %ymm6
13802; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13803; AVX2-FP-NEXT:    vmovaps 1600(%rdi), %ymm7
13804; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13805; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13806; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13807; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2]
13808; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3]
13809; AVX2-FP-NEXT:    vmovaps 1760(%rdi), %ymm9
13810; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13811; AVX2-FP-NEXT:    vmovaps 1728(%rdi), %ymm6
13812; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13813; AVX2-FP-NEXT:    vmovaps 1696(%rdi), %ymm7
13814; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13815; AVX2-FP-NEXT:    vmovaps 1664(%rdi), %ymm11
13816; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13817; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
13818; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
13819; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13820; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2]
13821; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
13822; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13823; AVX2-FP-NEXT:    vmovaps 1824(%rdi), %ymm6
13824; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13825; AVX2-FP-NEXT:    vmovaps 1792(%rdi), %ymm7
13826; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13827; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13828; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm11
13829; AVX2-FP-NEXT:    vmovaps 1888(%rdi), %ymm6
13830; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13831; AVX2-FP-NEXT:    vmovaps 1856(%rdi), %ymm7
13832; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13833; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13834; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13835; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2]
13836; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3]
13837; AVX2-FP-NEXT:    vmovaps 2016(%rdi), %ymm11
13838; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13839; AVX2-FP-NEXT:    vmovaps 1984(%rdi), %ymm6
13840; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13841; AVX2-FP-NEXT:    vmovaps 1952(%rdi), %ymm7
13842; AVX2-FP-NEXT:    vmovaps 1920(%rdi), %ymm9
13843; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13844; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
13845; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13846; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
13847; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
13848; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
13849; AVX2-FP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13850; AVX2-FP-NEXT:    vbroadcastss 148(%rdi), %ymm13
13851; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
13852; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
13853; AVX2-FP-NEXT:    # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
13854; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13855; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm14
13856; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13857; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
13858; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13859; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
13860; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
13861; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
13862; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
13863; AVX2-FP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13864; AVX2-FP-NEXT:    vbroadcastss 404(%rdi), %ymm13
13865; AVX2-FP-NEXT:    vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
13866; AVX2-FP-NEXT:    # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
13867; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
13868; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13869; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm13
13870; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
13871; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
13872; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13873; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
13874; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
13875; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
13876; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
13877; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13878; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm12
13879; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
13880; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
13881; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13882; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
13883; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13884; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
13885; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
13886; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
13887; AVX2-FP-NEXT:    vextractf128 $1, %ymm12, %xmm12
13888; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
13889; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
13890; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13891; AVX2-FP-NEXT:    vbroadcastss 916(%rdi), %ymm4
13892; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13893; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
13894; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13895; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
13896; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13897; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
13898; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
13899; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7]
13900; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
13901; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
13902; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
13903; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13904; AVX2-FP-NEXT:    vbroadcastss 1172(%rdi), %ymm3
13905; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13906; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13907; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13908; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
13909; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13910; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
13911; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13912; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
13913; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm3
13914; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
13915; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13916; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13917; AVX2-FP-NEXT:    vbroadcastss 1428(%rdi), %ymm1
13918; AVX2-FP-NEXT:    vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
13919; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
13920; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13921; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13922; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
13923; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13924; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
13925; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13926; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
13927; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
13928; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
13929; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13930; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13931; AVX2-FP-NEXT:    vbroadcastss 1684(%rdi), %ymm0
13932; AVX2-FP-NEXT:    vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13933; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
13934; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13935; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
13936; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13937; AVX2-FP-NEXT:    vextractf128 $1, %ymm1, %xmm1
13938; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13939; AVX2-FP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
13940; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13941; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
13942; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
13943; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13944; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13945; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13946; AVX2-FP-NEXT:    vbroadcastss 1940(%rdi), %ymm0
13947; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
13948; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
13949; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13950; AVX2-FP-NEXT:    vextractf128 $1, %ymm1, %xmm1
13951; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13952; AVX2-FP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
13953; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13954; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
13955; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
13956; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13957; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13958; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13959; AVX2-FP-NEXT:    vbroadcastss 248(%rdi), %ymm0
13960; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13961; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
13962; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13963; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
13964; AVX2-FP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
13965; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13966; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
13967; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13968; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13969; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
13970; AVX2-FP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
13971; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13972; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm1
13973; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2]
13974; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
13975; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
13976; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13977; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13978; AVX2-FP-NEXT:    vbroadcastss 504(%rdi), %ymm0
13979; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13980; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
13981; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13982; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
13983; AVX2-FP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
13984; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13985; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7]
13986; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13987; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13988; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
13989; AVX2-FP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
13990; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13991; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm1
13992; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
13993; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
13994; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
13995; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13996; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13997; AVX2-FP-NEXT:    vbroadcastss 760(%rdi), %ymm0
13998; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13999; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14000; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14001; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14002; AVX2-FP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14003; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14004; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
14005; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14006; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14007; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
14008; AVX2-FP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14009; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14010; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm1
14011; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
14012; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14013; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14014; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14015; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14016; AVX2-FP-NEXT:    vbroadcastss 1016(%rdi), %ymm0
14017; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14018; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14019; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14020; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14021; AVX2-FP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14022; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14023; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
14024; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14025; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14026; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
14027; AVX2-FP-NEXT:    # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14028; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm1
14029; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2]
14030; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14031; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14032; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14033; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14034; AVX2-FP-NEXT:    vbroadcastss 1272(%rdi), %ymm0
14035; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14036; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14037; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14038; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
14039; AVX2-FP-NEXT:    # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14040; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
14041; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14042; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
14043; AVX2-FP-NEXT:    # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14044; AVX2-FP-NEXT:    vextractf128 $1, %ymm13, %xmm1
14045; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2]
14046; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14047; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
14048; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14049; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14050; AVX2-FP-NEXT:    vbroadcastss 1528(%rdi), %ymm0
14051; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14052; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14053; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14054; AVX2-FP-NEXT:    vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
14055; AVX2-FP-NEXT:    # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14056; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
14057; AVX2-FP-NEXT:    # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
14058; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14059; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
14060; AVX2-FP-NEXT:    # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14061; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm1
14062; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2]
14063; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14064; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
14065; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14066; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14067; AVX2-FP-NEXT:    vbroadcastss 1784(%rdi), %ymm0
14068; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14069; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14070; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14071; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14072; AVX2-FP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14073; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14074; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
14075; AVX2-FP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14076; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14077; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
14078; AVX2-FP-NEXT:    # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14079; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm1
14080; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2]
14081; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14082; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14083; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14084; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14085; AVX2-FP-NEXT:    vbroadcastss 2040(%rdi), %ymm0
14086; AVX2-FP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
14087; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
14088; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14089; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
14090; AVX2-FP-NEXT:    # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14091; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14092; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
14093; AVX2-FP-NEXT:    # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14094; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14095; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
14096; AVX2-FP-NEXT:    # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14097; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm0
14098; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2]
14099; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
14100; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
14101; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14102; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
14103; AVX2-FP-NEXT:    vbroadcastss 220(%rdi), %ymm0
14104; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14105; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14106; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14107; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14108; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14109; AVX2-FP-NEXT:    vextractf128 $1, %ymm1, %xmm1
14110; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14111; AVX2-FP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
14112; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
14113; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
14114; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14115; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14116; AVX2-FP-NEXT:    vbroadcastss 476(%rdi), %ymm0
14117; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14118; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14119; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14120; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14121; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14122; AVX2-FP-NEXT:    vextractf128 $1, %ymm1, %xmm1
14123; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14124; AVX2-FP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
14125; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
14126; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
14127; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14128; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14129; AVX2-FP-NEXT:    vbroadcastss 732(%rdi), %ymm0
14130; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14131; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14132; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14133; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14134; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14135; AVX2-FP-NEXT:    vextractf128 $1, %ymm1, %xmm1
14136; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14137; AVX2-FP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
14138; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
14139; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
14140; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14141; AVX2-FP-NEXT:    vbroadcastss 988(%rdi), %ymm0
14142; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14143; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14144; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14145; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14146; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm1
14147; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
14148; AVX2-FP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
14149; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
14150; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
14151; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14152; AVX2-FP-NEXT:    vbroadcastss 1244(%rdi), %ymm0
14153; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14154; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14155; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
14156; AVX2-FP-NEXT:    vextractf128 $1, %ymm11, %xmm1
14157; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7]
14158; AVX2-FP-NEXT:    vextractf128 $1, %ymm11, %xmm11
14159; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
14160; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14161; AVX2-FP-NEXT:    vbroadcastss 1500(%rdi), %ymm0
14162; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14163; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14164; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
14165; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
14166; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
14167; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm9
14168; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14169; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
14170; AVX2-FP-NEXT:    vbroadcastss 1756(%rdi), %ymm0
14171; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14172; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14173; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
14174; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
14175; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7]
14176; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
14177; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
14178; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
14179; AVX2-FP-NEXT:    vbroadcastss 2012(%rdi), %ymm0
14180; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14181; AVX2-FP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14182; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
14183; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
14184; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
14185; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm3
14186; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
14187; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
14188; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14189; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%rsi)
14190; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14191; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%rsi)
14192; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14193; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rsi)
14194; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14195; AVX2-FP-NEXT:    vmovaps %ymm2, (%rsi)
14196; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14197; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%rsi)
14198; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14199; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rsi)
14200; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14201; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rsi)
14202; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14203; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rsi)
14204; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14205; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%rdx)
14206; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14207; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%rdx)
14208; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14209; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rdx)
14210; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14211; AVX2-FP-NEXT:    vmovaps %ymm2, (%rdx)
14212; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14213; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%rdx)
14214; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14215; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rdx)
14216; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14217; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rdx)
14218; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14219; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rdx)
14220; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14221; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%rcx)
14222; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14223; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%rcx)
14224; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14225; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rcx)
14226; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14227; AVX2-FP-NEXT:    vmovaps %ymm2, (%rcx)
14228; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14229; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%rcx)
14230; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14231; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rcx)
14232; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14233; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rcx)
14234; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14235; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rcx)
14236; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14237; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%r8)
14238; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14239; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%r8)
14240; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14241; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%r8)
14242; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14243; AVX2-FP-NEXT:    vmovaps %ymm2, (%r8)
14244; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14245; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%r8)
14246; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14247; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%r8)
14248; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14249; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%r8)
14250; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14251; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%r8)
14252; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14253; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%r9)
14254; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14255; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%r9)
14256; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14257; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%r9)
14258; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14259; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%r9)
14260; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14261; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%r9)
14262; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14263; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%r9)
14264; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14265; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%r9)
14266; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14267; AVX2-FP-NEXT:    vmovaps %ymm2, (%r9)
14268; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14269; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14270; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%rax)
14271; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14272; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%rax)
14273; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14274; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rax)
14275; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14276; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%rax)
14277; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14278; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rax)
14279; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14280; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rax)
14281; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14282; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rax)
14283; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14284; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
14285; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14286; AVX2-FP-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
14287; AVX2-FP-NEXT:    vmovaps %ymm2, 224(%rax)
14288; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14289; AVX2-FP-NEXT:    vmovaps %ymm2, 192(%rax)
14290; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14291; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rax)
14292; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14293; AVX2-FP-NEXT:    vmovaps %ymm2, 128(%rax)
14294; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14295; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rax)
14296; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14297; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rax)
14298; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14299; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rax)
14300; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14301; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
14302; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14303; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
14304; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rax)
14305; AVX2-FP-NEXT:    vmovaps %ymm8, 160(%rax)
14306; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%rax)
14307; AVX2-FP-NEXT:    vmovaps %ymm14, 96(%rax)
14308; AVX2-FP-NEXT:    vmovaps %ymm15, 64(%rax)
14309; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14310; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
14311; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14312; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
14313; AVX2-FP-NEXT:    addq $3528, %rsp # imm = 0xDC8
14314; AVX2-FP-NEXT:    vzeroupper
14315; AVX2-FP-NEXT:    retq
14316;
14317; AVX2-FCP-LABEL: load_i32_stride8_vf64:
14318; AVX2-FCP:       # %bb.0:
14319; AVX2-FCP-NEXT:    subq $3528, %rsp # imm = 0xDC8
14320; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm10
14321; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %xmm0
14322; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14323; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
14324; AVX2-FCP-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14325; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %xmm9
14326; AVX2-FCP-NEXT:    vbroadcastss %xmm9, %xmm1
14327; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %xmm2
14328; AVX2-FCP-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
14329; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14330; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14331; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14332; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %xmm1
14333; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14334; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm2
14335; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14336; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14337; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14338; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %xmm13
14339; AVX2-FCP-NEXT:    vbroadcastss %xmm13, %xmm2
14340; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %xmm3
14341; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14342; AVX2-FCP-NEXT:    vbroadcastss %xmm3, %xmm3
14343; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14344; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14345; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
14346; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14347; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14348; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %xmm0
14349; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14350; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %xmm1
14351; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14352; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14353; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %xmm12
14354; AVX2-FCP-NEXT:    vbroadcastss %xmm12, %xmm1
14355; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %xmm2
14356; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14357; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14358; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14359; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14360; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %xmm1
14361; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14362; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14363; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %xmm2
14364; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14365; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14366; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14367; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %xmm2
14368; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14369; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %xmm3
14370; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14371; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14372; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14373; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14374; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14375; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14376; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14377; AVX2-FCP-NEXT:    vmovaps 1376(%rdi), %xmm0
14378; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14379; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
14380; AVX2-FCP-NEXT:    vmovaps 1344(%rdi), %xmm1
14381; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14382; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14383; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14384; AVX2-FCP-NEXT:    vmovaps 1312(%rdi), %xmm1
14385; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14386; AVX2-FCP-NEXT:    vmovaps 1280(%rdi), %xmm2
14387; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14388; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14389; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14390; AVX2-FCP-NEXT:    vmovaps 1504(%rdi), %xmm1
14391; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14392; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14393; AVX2-FCP-NEXT:    vmovaps 1472(%rdi), %xmm2
14394; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14395; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14396; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14397; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %xmm2
14398; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14399; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %xmm3
14400; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14401; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14402; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14403; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14404; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14405; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14406; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14407; AVX2-FCP-NEXT:    vmovaps 1888(%rdi), %xmm0
14408; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14409; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
14410; AVX2-FCP-NEXT:    vmovaps 1856(%rdi), %xmm1
14411; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14412; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14413; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14414; AVX2-FCP-NEXT:    vmovaps 1824(%rdi), %xmm1
14415; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14416; AVX2-FCP-NEXT:    vmovaps 1792(%rdi), %xmm2
14417; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14418; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14419; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
14420; AVX2-FCP-NEXT:    vmovaps 2016(%rdi), %xmm0
14421; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14422; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm1
14423; AVX2-FCP-NEXT:    vmovaps 1984(%rdi), %xmm0
14424; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14425; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm2
14426; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14427; AVX2-FCP-NEXT:    vmovaps 1952(%rdi), %xmm0
14428; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14429; AVX2-FCP-NEXT:    vmovaps 1920(%rdi), %xmm2
14430; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14431; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
14432; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14433; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14434; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14435; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
14436; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14437; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %xmm0
14438; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14439; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
14440; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %xmm1
14441; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14442; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14443; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14444; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %xmm2
14445; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14446; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %xmm1
14447; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14448; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
14449; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14450; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %xmm1
14451; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14452; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14453; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %xmm2
14454; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14455; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14456; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14457; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %xmm3
14458; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14459; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %xmm2
14460; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14461; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14462; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14463; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14464; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14465; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14466; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14467; AVX2-FCP-NEXT:    vmovaps 1120(%rdi), %xmm0
14468; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14469; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
14470; AVX2-FCP-NEXT:    vmovaps 1088(%rdi), %xmm1
14471; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14472; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14473; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14474; AVX2-FCP-NEXT:    vmovaps 1056(%rdi), %xmm2
14475; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14476; AVX2-FCP-NEXT:    vmovaps 1024(%rdi), %xmm1
14477; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14478; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
14479; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14480; AVX2-FCP-NEXT:    vmovaps 1248(%rdi), %xmm1
14481; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14482; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14483; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %xmm2
14484; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14485; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14486; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14487; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %xmm3
14488; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14489; AVX2-FCP-NEXT:    vmovaps 1152(%rdi), %xmm2
14490; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14491; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14492; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14493; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14494; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14495; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14496; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14497; AVX2-FCP-NEXT:    vmovaps 1632(%rdi), %xmm0
14498; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14499; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
14500; AVX2-FCP-NEXT:    vmovaps 1600(%rdi), %xmm1
14501; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14502; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14503; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14504; AVX2-FCP-NEXT:    vmovaps 1568(%rdi), %xmm2
14505; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14506; AVX2-FCP-NEXT:    vmovaps 1536(%rdi), %xmm1
14507; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14508; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
14509; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14510; AVX2-FCP-NEXT:    vmovaps 1760(%rdi), %xmm1
14511; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14512; AVX2-FCP-NEXT:    vbroadcastss %xmm1, %xmm1
14513; AVX2-FCP-NEXT:    vmovaps 1728(%rdi), %xmm2
14514; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14515; AVX2-FCP-NEXT:    vbroadcastss %xmm2, %xmm2
14516; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14517; AVX2-FCP-NEXT:    vmovaps 1696(%rdi), %xmm3
14518; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14519; AVX2-FCP-NEXT:    vmovaps 1664(%rdi), %xmm2
14520; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14521; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14522; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14523; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14524; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14525; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14526; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14527; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %xmm0
14528; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14529; AVX2-FCP-NEXT:    vbroadcastss %xmm0, %xmm0
14530; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm11
14531; AVX2-FCP-NEXT:    vbroadcastss %xmm11, %xmm1
14532; AVX2-FCP-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14533; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14534; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %xmm2
14535; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14536; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm1
14537; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14538; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
14539; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14540; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14541; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14542; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm8
14543; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm1
14544; AVX2-FCP-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14545; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm7
14546; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm2
14547; AVX2-FCP-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14548; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14549; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm5
14550; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm6
14551; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
14552; AVX2-FCP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14553; AVX2-FCP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14554; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
14555; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
14556; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14557; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14558; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14559; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
14560; AVX2-FCP-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14561; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
14562; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
14563; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14564; AVX2-FCP-NEXT:    vmovaps %xmm13, %xmm9
14565; AVX2-FCP-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14566; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
14567; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
14568; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14569; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14570; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14571; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14572; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14573; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14574; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14575; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14576; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14577; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14578; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14579; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14580; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14581; AVX2-FCP-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14582; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14583; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
14584; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14585; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
14586; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
14587; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
14588; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14589; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14590; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14591; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14592; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14593; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14594; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14595; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14596; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14597; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14598; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14599; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14600; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14601; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14602; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14603; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14604; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14605; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
14606; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14607; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
14608; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14609; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14610; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14611; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14612; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14613; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14614; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14615; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14616; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14617; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14618; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14619; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14620; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14621; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14622; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14623; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14624; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14625; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14626; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14627; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14628; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14629; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14630; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14631; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14632; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14633; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14634; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14635; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14636; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14637; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
14638; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
14639; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
14640; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14641; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
14642; AVX2-FCP-NEXT:    # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
14643; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14644; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14645; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14646; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14647; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14648; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14649; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14650; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14651; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14652; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14653; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14654; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14655; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14656; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14657; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14658; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14659; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14660; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14661; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14662; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
14663; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14664; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14665; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14666; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14667; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14668; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14669; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14670; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14671; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14672; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14673; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14674; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14675; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14676; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14677; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14678; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14679; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14680; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14681; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
14682; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
14683; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14684; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14685; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14686; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14687; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14688; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14689; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14690; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14691; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14692; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14693; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1]
14694; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14695; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14696; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14697; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14698; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14699; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14700; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14701; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14702; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
14703; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14704; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14705; AVX2-FCP-NEXT:    # xmm2 = mem[1,1,1,1]
14706; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14707; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14708; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14709; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14710; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14711; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14712; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14713; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
14714; AVX2-FCP-NEXT:    # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
14715; AVX2-FCP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14716; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14717; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14718; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14719; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14720; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2]
14721; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
14722; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14723; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14724; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14725; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
14726; AVX2-FCP-NEXT:    # xmm1 = mem[2,2,2,2]
14727; AVX2-FCP-NEXT:    vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
14728; AVX2-FCP-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
14729; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
14730; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14731; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14732; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14733; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14734; AVX2-FCP-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
14735; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14736; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14737; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14738; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14739; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14740; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
14741; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
14742; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14743; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14744; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14745; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
14746; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2]
14747; AVX2-FCP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14748; AVX2-FCP-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
14749; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14750; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14751; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14752; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14753; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
14754; AVX2-FCP-NEXT:    # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3]
14755; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14756; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14757; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14758; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14759; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
14760; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
14761; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14762; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14763; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14764; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
14765; AVX2-FCP-NEXT:    # xmm1 = mem[2,2,2,2]
14766; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
14767; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
14768; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
14769; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14770; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14771; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14772; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
14773; AVX2-FCP-NEXT:    # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
14774; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14775; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14776; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14777; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14778; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14779; AVX2-FCP-NEXT:    # xmm0 = mem[2,2,2,2]
14780; AVX2-FCP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14781; AVX2-FCP-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
14782; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14783; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14784; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14785; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
14786; AVX2-FCP-NEXT:    # xmm1 = mem[2,2,2,2]
14787; AVX2-FCP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14788; AVX2-FCP-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
14789; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
14790; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14791; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14792; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14793; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14794; AVX2-FCP-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
14795; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14796; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14797; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14798; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14799; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14800; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2]
14801; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
14802; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14803; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14804; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14805; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
14806; AVX2-FCP-NEXT:    # xmm1 = mem[2,2,2,2]
14807; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14808; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
14809; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14810; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14811; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14812; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14813; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14814; AVX2-FCP-NEXT:    # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
14815; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14816; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14817; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14818; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14819; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14820; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
14821; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
14822; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14823; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14824; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14825; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14826; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
14827; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14828; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
14829; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14830; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14831; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14832; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14833; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
14834; AVX2-FCP-NEXT:    # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
14835; AVX2-FCP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14836; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14837; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14838; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14839; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14840; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14841; AVX2-FCP-NEXT:    # xmm0 = mem[2,2,2,2]
14842; AVX2-FCP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14843; AVX2-FCP-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
14844; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14845; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm15
14846; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
14847; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14848; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
14849; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14850; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
14851; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3]
14852; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14853; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14854; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14855; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
14856; AVX2-FCP-NEXT:    # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
14857; AVX2-FCP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14858; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
14859; AVX2-FCP-NEXT:    # xmm15 = mem[2,2,2,2]
14860; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14861; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3]
14862; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
14863; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm14
14864; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
14865; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14866; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
14867; AVX2-FCP-NEXT:    # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3]
14868; AVX2-FCP-NEXT:    vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
14869; AVX2-FCP-NEXT:    # xmm13 = mem[2,2,2,2]
14870; AVX2-FCP-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
14871; AVX2-FCP-NEXT:    # xmm13 = mem[0,1,2],xmm13[3]
14872; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
14873; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7]
14874; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14875; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm5 # 16-byte Reload
14876; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
14877; AVX2-FCP-NEXT:    # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
14878; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14879; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1]
14880; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14881; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
14882; AVX2-FCP-NEXT:    # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
14883; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
14884; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
14885; AVX2-FCP-NEXT:    # xmm12 = mem[2,3,2,3]
14886; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
14887; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
14888; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
14889; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14890; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14891; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
14892; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14893; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1]
14894; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14895; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload
14896; AVX2-FCP-NEXT:    # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3]
14897; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
14898; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
14899; AVX2-FCP-NEXT:    # xmm13 = mem[2,3,2,3]
14900; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
14901; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
14902; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
14903; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14904; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload
14905; AVX2-FCP-NEXT:    # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3]
14906; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1]
14907; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14908; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload
14909; AVX2-FCP-NEXT:    # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3]
14910; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
14911; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
14912; AVX2-FCP-NEXT:    # xmm10 = mem[2,3,2,3]
14913; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
14914; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
14915; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
14916; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14917; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14918; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
14919; AVX2-FCP-NEXT:    # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
14920; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1]
14921; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14922; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
14923; AVX2-FCP-NEXT:    # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
14924; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
14925; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
14926; AVX2-FCP-NEXT:    # xmm8 = mem[2,3,2,3]
14927; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
14928; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
14929; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
14930; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14931; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
14932; AVX2-FCP-NEXT:    # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
14933; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14934; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
14935; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14936; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
14937; AVX2-FCP-NEXT:    # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
14938; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
14939; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
14940; AVX2-FCP-NEXT:    # xmm6 = mem[2,3,2,3]
14941; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
14942; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
14943; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
14944; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14945; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
14946; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14947; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
14948; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14949; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
14950; AVX2-FCP-NEXT:    # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3]
14951; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
14952; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
14953; AVX2-FCP-NEXT:    # xmm4 = mem[2,3,2,3]
14954; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
14955; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
14956; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
14957; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14958; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14959; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14960; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
14961; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14962; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
14963; AVX2-FCP-NEXT:    # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
14964; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
14965; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
14966; AVX2-FCP-NEXT:    # xmm2 = mem[2,3,2,3]
14967; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14968; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
14969; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14970; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14971; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14972; AVX2-FCP-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
14973; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14974; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
14975; AVX2-FCP-NEXT:    # xmm0 = mem[2,3,2,3]
14976; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14977; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14978; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14979; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14980; AVX2-FCP-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
14981; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
14982; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14983; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14984; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm0
14985; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14986; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
14987; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14988; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
14989; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
14990; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
14991; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14992; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm2
14993; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14994; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
14995; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14996; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
14997; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14998; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm2
14999; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15000; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm3
15001; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15002; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm15
15003; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
15004; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15005; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
15006; AVX2-FCP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15007; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
15008; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15009; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
15010; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15011; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15012; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm0
15013; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15014; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm1
15015; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15016; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
15017; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
15018; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm1
15019; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15020; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm2
15021; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15022; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
15023; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15024; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
15025; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15026; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm2
15027; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15028; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm3
15029; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15030; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm4
15031; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15032; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm1
15033; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15034; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
15035; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
15036; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
15037; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15038; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15039; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm0
15040; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15041; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm1
15042; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15043; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
15044; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
15045; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm1
15046; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15047; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm2
15048; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15049; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
15050; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15051; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
15052; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15053; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
15054; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15055; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm3
15056; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15057; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm4
15058; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm1
15059; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15060; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
15061; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15062; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
15063; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
15064; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15065; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15066; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %ymm0
15067; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15068; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %ymm1
15069; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15070; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
15071; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
15072; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %ymm1
15073; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15074; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %ymm2
15075; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15076; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
15077; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15078; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
15079; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15080; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %ymm2
15081; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15082; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %ymm5
15083; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15084; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %ymm3
15085; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %ymm1
15086; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15087; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
15088; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15089; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
15090; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
15091; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15092; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15093; AVX2-FCP-NEXT:    vmovaps 1056(%rdi), %ymm0
15094; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15095; AVX2-FCP-NEXT:    vmovaps 1024(%rdi), %ymm1
15096; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15097; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
15098; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
15099; AVX2-FCP-NEXT:    vmovaps 1120(%rdi), %ymm1
15100; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15101; AVX2-FCP-NEXT:    vmovaps 1088(%rdi), %ymm2
15102; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15103; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
15104; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15105; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
15106; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15107; AVX2-FCP-NEXT:    vmovaps 1248(%rdi), %ymm1
15108; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15109; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %ymm7
15110; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15111; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %ymm2
15112; AVX2-FCP-NEXT:    vmovaps 1152(%rdi), %ymm6
15113; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15114; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
15115; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15116; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
15117; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
15118; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
15119; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15120; AVX2-FCP-NEXT:    vmovaps 1312(%rdi), %ymm0
15121; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15122; AVX2-FCP-NEXT:    vmovaps 1280(%rdi), %ymm6
15123; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15124; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
15125; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm6
15126; AVX2-FCP-NEXT:    vmovaps 1376(%rdi), %ymm0
15127; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15128; AVX2-FCP-NEXT:    vmovaps 1344(%rdi), %ymm7
15129; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15130; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
15131; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15132; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2]
15133; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
15134; AVX2-FCP-NEXT:    vmovaps 1504(%rdi), %ymm6
15135; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15136; AVX2-FCP-NEXT:    vmovaps 1472(%rdi), %ymm10
15137; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15138; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %ymm0
15139; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
15140; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %ymm9
15141; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15142; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
15143; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
15144; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
15145; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
15146; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15147; AVX2-FCP-NEXT:    vmovaps 1568(%rdi), %ymm6
15148; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15149; AVX2-FCP-NEXT:    vmovaps 1536(%rdi), %ymm7
15150; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15151; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
15152; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm9
15153; AVX2-FCP-NEXT:    vmovaps 1632(%rdi), %ymm6
15154; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15155; AVX2-FCP-NEXT:    vmovaps 1600(%rdi), %ymm7
15156; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15157; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
15158; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15159; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2]
15160; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3]
15161; AVX2-FCP-NEXT:    vmovaps 1760(%rdi), %ymm9
15162; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15163; AVX2-FCP-NEXT:    vmovaps 1728(%rdi), %ymm6
15164; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15165; AVX2-FCP-NEXT:    vmovaps 1696(%rdi), %ymm7
15166; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15167; AVX2-FCP-NEXT:    vmovaps 1664(%rdi), %ymm11
15168; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15169; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
15170; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
15171; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15172; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2]
15173; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
15174; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15175; AVX2-FCP-NEXT:    vmovaps 1824(%rdi), %ymm6
15176; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15177; AVX2-FCP-NEXT:    vmovaps 1792(%rdi), %ymm7
15178; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15179; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
15180; AVX2-FCP-NEXT:    vextractf128 $1, %ymm10, %xmm11
15181; AVX2-FCP-NEXT:    vmovaps 1888(%rdi), %ymm6
15182; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15183; AVX2-FCP-NEXT:    vmovaps 1856(%rdi), %ymm7
15184; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15185; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
15186; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15187; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2]
15188; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3]
15189; AVX2-FCP-NEXT:    vmovaps 2016(%rdi), %ymm11
15190; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15191; AVX2-FCP-NEXT:    vmovaps 1984(%rdi), %ymm6
15192; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15193; AVX2-FCP-NEXT:    vmovaps 1952(%rdi), %ymm7
15194; AVX2-FCP-NEXT:    vmovaps 1920(%rdi), %ymm9
15195; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15196; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
15197; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15198; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
15199; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
15200; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
15201; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15202; AVX2-FCP-NEXT:    vbroadcastss 148(%rdi), %ymm13
15203; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
15204; AVX2-FCP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15205; AVX2-FCP-NEXT:    # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
15206; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
15207; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm14
15208; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
15209; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
15210; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
15211; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
15212; AVX2-FCP-NEXT:    vextractf128 $1, %ymm15, %xmm15
15213; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
15214; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
15215; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15216; AVX2-FCP-NEXT:    vbroadcastss 404(%rdi), %ymm13
15217; AVX2-FCP-NEXT:    vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15218; AVX2-FCP-NEXT:    # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
15219; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
15220; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
15221; AVX2-FCP-NEXT:    vextractf128 $1, %ymm10, %xmm13
15222; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
15223; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
15224; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
15225; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
15226; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
15227; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
15228; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
15229; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15230; AVX2-FCP-NEXT:    vbroadcastss 660(%rdi), %ymm12
15231; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
15232; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
15233; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
15234; AVX2-FCP-NEXT:    vextractf128 $1, %ymm8, %xmm8
15235; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15236; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
15237; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
15238; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
15239; AVX2-FCP-NEXT:    vextractf128 $1, %ymm12, %xmm12
15240; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
15241; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
15242; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15243; AVX2-FCP-NEXT:    vbroadcastss 916(%rdi), %ymm4
15244; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15245; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
15246; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
15247; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
15248; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
15249; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
15250; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
15251; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7]
15252; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
15253; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
15254; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
15255; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15256; AVX2-FCP-NEXT:    vbroadcastss 1172(%rdi), %ymm3
15257; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15258; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15259; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15260; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
15261; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
15262; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
15263; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
15264; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
15265; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm3
15266; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15267; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15268; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15269; AVX2-FCP-NEXT:    vbroadcastss 1428(%rdi), %ymm1
15270; AVX2-FCP-NEXT:    vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
15271; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
15272; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15273; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15274; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
15275; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
15276; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
15277; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15278; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
15279; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
15280; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
15281; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15282; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15283; AVX2-FCP-NEXT:    vbroadcastss 1684(%rdi), %ymm0
15284; AVX2-FCP-NEXT:    vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15285; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
15286; AVX2-FCP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15287; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
15288; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15289; AVX2-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
15290; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15291; AVX2-FCP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
15292; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15293; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
15294; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
15295; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15296; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15297; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15298; AVX2-FCP-NEXT:    vbroadcastss 1940(%rdi), %ymm0
15299; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
15300; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
15301; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15302; AVX2-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
15303; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15304; AVX2-FCP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
15305; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15306; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
15307; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
15308; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15309; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15310; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15311; AVX2-FCP-NEXT:    vbroadcastss 248(%rdi), %ymm0
15312; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15313; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15314; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15315; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15316; AVX2-FCP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15317; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15318; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
15319; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15320; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15321; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15322; AVX2-FCP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15323; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15324; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm1
15325; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2]
15326; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15327; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
15328; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15329; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15330; AVX2-FCP-NEXT:    vbroadcastss 504(%rdi), %ymm0
15331; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15332; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15333; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15334; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15335; AVX2-FCP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15336; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15337; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7]
15338; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15339; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15340; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15341; AVX2-FCP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15342; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15343; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm1
15344; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
15345; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15346; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15347; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15348; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15349; AVX2-FCP-NEXT:    vbroadcastss 760(%rdi), %ymm0
15350; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15351; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15352; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15353; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15354; AVX2-FCP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15355; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15356; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
15357; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15358; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15359; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15360; AVX2-FCP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15361; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15362; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm1
15363; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
15364; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15365; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15366; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15367; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15368; AVX2-FCP-NEXT:    vbroadcastss 1016(%rdi), %ymm0
15369; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15370; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15371; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15372; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15373; AVX2-FCP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15374; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15375; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
15376; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15377; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15378; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
15379; AVX2-FCP-NEXT:    # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15380; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm1
15381; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2]
15382; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15383; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15384; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15385; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15386; AVX2-FCP-NEXT:    vbroadcastss 1272(%rdi), %ymm0
15387; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15388; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15389; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15390; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
15391; AVX2-FCP-NEXT:    # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15392; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
15393; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15394; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
15395; AVX2-FCP-NEXT:    # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15396; AVX2-FCP-NEXT:    vextractf128 $1, %ymm13, %xmm1
15397; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2]
15398; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15399; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
15400; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15401; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15402; AVX2-FCP-NEXT:    vbroadcastss 1528(%rdi), %ymm0
15403; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15404; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15405; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15406; AVX2-FCP-NEXT:    vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
15407; AVX2-FCP-NEXT:    # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15408; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
15409; AVX2-FCP-NEXT:    # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
15410; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15411; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
15412; AVX2-FCP-NEXT:    # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15413; AVX2-FCP-NEXT:    vextractf128 $1, %ymm9, %xmm1
15414; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2]
15415; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15416; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
15417; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15418; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15419; AVX2-FCP-NEXT:    vbroadcastss 1784(%rdi), %ymm0
15420; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15421; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15422; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15423; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15424; AVX2-FCP-NEXT:    # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15425; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15426; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15427; AVX2-FCP-NEXT:    # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15428; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15429; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
15430; AVX2-FCP-NEXT:    # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15431; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm1
15432; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2]
15433; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15434; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15435; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15436; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15437; AVX2-FCP-NEXT:    vbroadcastss 2040(%rdi), %ymm0
15438; AVX2-FCP-NEXT:    vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
15439; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
15440; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15441; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
15442; AVX2-FCP-NEXT:    # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15443; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15444; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
15445; AVX2-FCP-NEXT:    # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15446; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15447; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
15448; AVX2-FCP-NEXT:    # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15449; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm0
15450; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2]
15451; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
15452; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
15453; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15454; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
15455; AVX2-FCP-NEXT:    vbroadcastss 220(%rdi), %ymm0
15456; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15457; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15458; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15459; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15460; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15461; AVX2-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
15462; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
15463; AVX2-FCP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
15464; AVX2-FCP-NEXT:    vextractf128 $1, %ymm15, %xmm15
15465; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
15466; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15467; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15468; AVX2-FCP-NEXT:    vbroadcastss 476(%rdi), %ymm0
15469; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15470; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15471; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15472; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15473; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15474; AVX2-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
15475; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
15476; AVX2-FCP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
15477; AVX2-FCP-NEXT:    vextractf128 $1, %ymm15, %xmm15
15478; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
15479; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15480; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15481; AVX2-FCP-NEXT:    vbroadcastss 732(%rdi), %ymm0
15482; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15483; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15484; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15485; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15486; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15487; AVX2-FCP-NEXT:    vextractf128 $1, %ymm1, %xmm1
15488; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
15489; AVX2-FCP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
15490; AVX2-FCP-NEXT:    vextractf128 $1, %ymm15, %xmm15
15491; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
15492; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15493; AVX2-FCP-NEXT:    vbroadcastss 988(%rdi), %ymm0
15494; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15495; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15496; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15497; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15498; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm1
15499; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
15500; AVX2-FCP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
15501; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
15502; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
15503; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15504; AVX2-FCP-NEXT:    vbroadcastss 1244(%rdi), %ymm0
15505; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15506; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15507; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
15508; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm1
15509; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7]
15510; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
15511; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
15512; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15513; AVX2-FCP-NEXT:    vbroadcastss 1500(%rdi), %ymm0
15514; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15515; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15516; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
15517; AVX2-FCP-NEXT:    vextractf128 $1, %ymm8, %xmm8
15518; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
15519; AVX2-FCP-NEXT:    vextractf128 $1, %ymm9, %xmm9
15520; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
15521; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
15522; AVX2-FCP-NEXT:    vbroadcastss 1756(%rdi), %ymm0
15523; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15524; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15525; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
15526; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
15527; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7]
15528; AVX2-FCP-NEXT:    vextractf128 $1, %ymm6, %xmm6
15529; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
15530; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
15531; AVX2-FCP-NEXT:    vbroadcastss 2012(%rdi), %ymm0
15532; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15533; AVX2-FCP-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15534; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
15535; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
15536; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
15537; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm3
15538; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15539; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
15540; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15541; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rsi)
15542; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15543; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%rsi)
15544; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15545; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rsi)
15546; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15547; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rsi)
15548; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15549; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%rsi)
15550; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15551; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%rsi)
15552; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15553; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rsi)
15554; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15555; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rsi)
15556; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15557; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rdx)
15558; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15559; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%rdx)
15560; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15561; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rdx)
15562; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15563; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rdx)
15564; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15565; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%rdx)
15566; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15567; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%rdx)
15568; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15569; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rdx)
15570; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15571; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rdx)
15572; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15573; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rcx)
15574; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15575; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%rcx)
15576; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15577; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rcx)
15578; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15579; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rcx)
15580; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15581; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%rcx)
15582; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15583; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%rcx)
15584; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15585; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rcx)
15586; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15587; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rcx)
15588; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15589; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%r8)
15590; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15591; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%r8)
15592; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15593; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%r8)
15594; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15595; AVX2-FCP-NEXT:    vmovaps %ymm2, (%r8)
15596; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15597; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%r8)
15598; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15599; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%r8)
15600; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15601; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%r8)
15602; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15603; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%r8)
15604; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15605; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%r9)
15606; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15607; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%r9)
15608; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15609; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%r9)
15610; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15611; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%r9)
15612; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15613; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%r9)
15614; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15615; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%r9)
15616; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15617; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%r9)
15618; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15619; AVX2-FCP-NEXT:    vmovaps %ymm2, (%r9)
15620; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15621; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15622; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%rax)
15623; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15624; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rax)
15625; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15626; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%rax)
15627; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15628; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%rax)
15629; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15630; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rax)
15631; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15632; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rax)
15633; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15634; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rax)
15635; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15636; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
15637; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15638; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
15639; AVX2-FCP-NEXT:    vmovaps %ymm2, 224(%rax)
15640; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15641; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rax)
15642; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15643; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%rax)
15644; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15645; AVX2-FCP-NEXT:    vmovaps %ymm2, 128(%rax)
15646; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15647; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rax)
15648; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15649; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rax)
15650; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15651; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rax)
15652; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15653; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
15654; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15655; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
15656; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rax)
15657; AVX2-FCP-NEXT:    vmovaps %ymm8, 160(%rax)
15658; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%rax)
15659; AVX2-FCP-NEXT:    vmovaps %ymm14, 96(%rax)
15660; AVX2-FCP-NEXT:    vmovaps %ymm15, 64(%rax)
15661; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15662; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
15663; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15664; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
15665; AVX2-FCP-NEXT:    addq $3528, %rsp # imm = 0xDC8
15666; AVX2-FCP-NEXT:    vzeroupper
15667; AVX2-FCP-NEXT:    retq
15668;
15669; AVX512-LABEL: load_i32_stride8_vf64:
15670; AVX512:       # %bb.0:
15671; AVX512-NEXT:    subq $3144, %rsp # imm = 0xC48
15672; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm11
15673; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15674; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm18
15675; AVX512-NEXT:    vmovdqa64 1600(%rdi), %zmm31
15676; AVX512-NEXT:    vmovaps 1536(%rdi), %zmm0
15677; AVX512-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
15678; AVX512-NEXT:    vmovdqa64 1728(%rdi), %zmm24
15679; AVX512-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15680; AVX512-NEXT:    vmovaps 1664(%rdi), %zmm0
15681; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15682; AVX512-NEXT:    vmovdqa64 1856(%rdi), %zmm21
15683; AVX512-NEXT:    vmovdqa64 1792(%rdi), %zmm26
15684; AVX512-NEXT:    vmovdqa64 1984(%rdi), %zmm22
15685; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm5
15686; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm13
15687; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm3
15688; AVX512-NEXT:    vmovdqa64 1216(%rdi), %zmm30
15689; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm2
15690; AVX512-NEXT:    vmovdqa64 1344(%rdi), %zmm29
15691; AVX512-NEXT:    vmovdqa64 1280(%rdi), %zmm27
15692; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm20
15693; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm10
15694; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm25
15695; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm7
15696; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm9
15697; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15698; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm12
15699; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm6
15700; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15701; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm28
15702; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm23
15703; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm4
15704; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
15705; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15706; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm1
15707; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm16
15708; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
15709; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
15710; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
15711; AVX512-NEXT:    movb $-64, %al
15712; AVX512-NEXT:    kmovw %eax, %k1
15713; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15714; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm1
15715; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm15
15716; AVX512-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
15717; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm12
15718; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm9
15719; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
15720; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15721; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15722; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15723; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm1
15724; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm6
15725; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15726; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
15727; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm4
15728; AVX512-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
15729; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15730; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1
15731; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
15732; AVX512-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
15733; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
15734; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm12
15735; AVX512-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
15736; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15737; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15738; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15739; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm14
15740; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15741; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
15742; AVX512-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
15743; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm17
15744; AVX512-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15745; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm4
15746; AVX512-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
15747; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15748; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15749; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm1
15750; AVX512-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
15751; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
15752; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm12
15753; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm24
15754; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
15755; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15756; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm5
15757; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15758; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15759; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15760; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
15761; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
15762; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm22
15763; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
15764; AVX512-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
15765; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15766; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm1
15767; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15768; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm5
15769; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15770; AVX512-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
15771; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm7
15772; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm19
15773; AVX512-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
15774; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15775; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15776; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15777; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15778; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
15779; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15780; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm1
15781; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
15782; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm4
15783; AVX512-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15784; AVX512-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
15785; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15786; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15787; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm1
15788; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm31
15789; AVX512-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
15790; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm12
15791; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15792; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm30
15793; AVX512-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
15794; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15795; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15796; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15797; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15798; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm1
15799; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
15800; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
15801; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15802; AVX512-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
15803; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15804; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15805; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm1
15806; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
15807; AVX512-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
15808; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15809; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm12
15810; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
15811; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15812; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15813; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15814; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm1
15815; AVX512-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
15816; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm4
15817; AVX512-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
15818; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15819; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm1
15820; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15821; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
15822; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm12
15823; AVX512-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
15824; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15825; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15826; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15827; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
15828; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm1
15829; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
15830; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
15831; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15832; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
15833; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15834; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
15835; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm1
15836; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
15837; AVX512-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
15838; AVX512-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
15839; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15840; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15841; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15842; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
15843; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15844; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
15845; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1
15846; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
15847; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm4
15848; AVX512-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
15849; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15850; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
15851; AVX512-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
15852; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
15853; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15854; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15855; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm1
15856; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
15857; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
15858; AVX512-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
15859; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15860; AVX512-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
15861; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm16
15862; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
15863; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm29
15864; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
15865; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15866; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15867; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15868; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm1
15869; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
15870; AVX512-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
15871; AVX512-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15872; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm4
15873; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm7
15874; AVX512-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
15875; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15876; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15877; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm1
15878; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm25
15879; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
15880; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
15881; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm12
15882; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm15
15883; AVX512-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
15884; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15885; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15886; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15887; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm13
15888; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm1
15889; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
15890; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
15891; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
15892; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15893; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm6
15894; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm1
15895; AVX512-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
15896; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm24
15897; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
15898; AVX512-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
15899; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15900; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15901; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15902; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
15903; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15904; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1
15905; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
15906; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm4
15907; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15908; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
15909; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15910; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15911; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm21
15912; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
15913; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
15914; AVX512-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
15915; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15916; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15917; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15918; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
15919; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm1
15920; AVX512-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15921; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
15922; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
15923; AVX512-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
15924; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm31
15925; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15926; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15927; AVX512-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
15928; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
15929; AVX512-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
15930; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15931; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15932; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15933; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm1
15934; AVX512-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
15935; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm4
15936; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
15937; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm26
15938; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15939; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm1
15940; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm9
15941; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
15942; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm12
15943; AVX512-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
15944; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15945; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15946; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15947; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm1
15948; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
15949; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm25
15950; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
15951; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15952; AVX512-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
15953; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15954; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm1
15955; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm18
15956; AVX512-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
15957; AVX512-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
15958; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm24
15959; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm13
15960; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15961; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15962; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15963; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
15964; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15965; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1
15966; AVX512-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15967; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
15968; AVX512-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15969; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm4
15970; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm3
15971; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
15972; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15973; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
15974; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm1
15975; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm6
15976; AVX512-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15977; AVX512-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
15978; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
15979; AVX512-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15980; AVX512-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
15981; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15982; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15983; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15984; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm1
15985; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
15986; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
15987; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
15988; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
15989; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15990; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm1
15991; AVX512-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
15992; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15993; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm12
15994; AVX512-NEXT:    vmovdqa64 %zmm29, %zmm11
15995; AVX512-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
15996; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15997; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15998; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15999; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
16000; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm1
16001; AVX512-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
16002; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm31
16003; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16004; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm4
16005; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm19
16006; AVX512-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
16007; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16008; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16009; AVX512-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
16010; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm26
16011; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
16012; AVX512-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
16013; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm29
16014; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16015; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16016; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16017; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16018; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
16019; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm1
16020; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
16021; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
16022; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm14
16023; AVX512-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16024; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16025; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16026; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm1
16027; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16028; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm5
16029; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm13
16030; AVX512-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16031; AVX512-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
16032; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16033; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16034; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16035; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
16036; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16037; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
16038; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
16039; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
16040; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm1
16041; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
16042; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16043; AVX512-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
16044; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16045; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
16046; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16047; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
16048; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm1
16049; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16050; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
16051; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
16052; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16053; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
16054; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16055; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm1
16056; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm15
16057; AVX512-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
16058; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm8
16059; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm12
16060; AVX512-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
16061; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm2
16062; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16063; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16064; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16065; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm1
16066; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
16067; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm16
16068; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm4
16069; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm27
16070; AVX512-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
16071; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm20
16072; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16073; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16074; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm1
16075; AVX512-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
16076; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm17
16077; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
16078; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm12
16079; AVX512-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
16080; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16081; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16082; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16083; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm1
16084; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
16085; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm4
16086; AVX512-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
16087; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16088; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm1
16089; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16090; AVX512-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
16091; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16092; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16093; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16094; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
16095; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16096; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm4
16097; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm30
16098; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
16099; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
16100; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16101; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
16102; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm19
16103; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm11
16104; AVX512-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
16105; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
16106; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm9
16107; AVX512-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
16108; AVX512-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
16109; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16110; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16111; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm8
16112; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
16113; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
16114; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16115; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm31
16116; AVX512-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
16117; AVX512-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
16118; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm16
16119; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm29
16120; AVX512-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
16121; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
16122; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm26
16123; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm2
16124; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm5
16125; AVX512-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
16126; AVX512-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
16127; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16128; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm4
16129; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16130; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
16131; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
16132; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm15
16133; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16134; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm23
16135; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16136; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
16137; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
16138; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm17
16139; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16140; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm24
16141; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16142; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
16143; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
16144; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm27
16145; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16146; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm13
16147; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16148; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
16149; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
16150; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16151; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm3
16152; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16153; AVX512-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
16154; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
16155; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm6
16156; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm20
16157; AVX512-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
16158; AVX512-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
16159; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm21
16160; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm25
16161; AVX512-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
16162; AVX512-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
16163; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16164; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm14
16165; AVX512-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
16166; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
16167; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16168; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16169; AVX512-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
16170; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
16171; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm6
16172; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
16173; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
16174; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
16175; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
16176; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
16177; AVX512-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16178; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
16179; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
16180; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
16181; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
16182; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
16183; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
16184; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
16185; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
16186; AVX512-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
16187; AVX512-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
16188; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
16189; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
16190; AVX512-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
16191; AVX512-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
16192; AVX512-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
16193; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
16194; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
16195; AVX512-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
16196; AVX512-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
16197; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
16198; AVX512-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
16199; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16200; AVX512-NEXT:    vmovaps %zmm8, 192(%rsi)
16201; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16202; AVX512-NEXT:    vmovaps %zmm8, 128(%rsi)
16203; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16204; AVX512-NEXT:    vmovaps %zmm8, 64(%rsi)
16205; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16206; AVX512-NEXT:    vmovaps %zmm6, (%rsi)
16207; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16208; AVX512-NEXT:    vmovaps %zmm6, 192(%rdx)
16209; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16210; AVX512-NEXT:    vmovaps %zmm6, (%rdx)
16211; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16212; AVX512-NEXT:    vmovaps %zmm6, 64(%rdx)
16213; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16214; AVX512-NEXT:    vmovaps %zmm6, 128(%rdx)
16215; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16216; AVX512-NEXT:    vmovaps %zmm6, 192(%rcx)
16217; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16218; AVX512-NEXT:    vmovaps %zmm6, (%rcx)
16219; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16220; AVX512-NEXT:    vmovaps %zmm6, 64(%rcx)
16221; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16222; AVX512-NEXT:    vmovaps %zmm6, 128(%rcx)
16223; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16224; AVX512-NEXT:    vmovaps %zmm6, 192(%r8)
16225; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16226; AVX512-NEXT:    vmovaps %zmm6, (%r8)
16227; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16228; AVX512-NEXT:    vmovaps %zmm6, 64(%r8)
16229; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16230; AVX512-NEXT:    vmovaps %zmm6, 128(%r8)
16231; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16232; AVX512-NEXT:    vmovaps %zmm6, 192(%r9)
16233; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16234; AVX512-NEXT:    vmovaps %zmm6, (%r9)
16235; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16236; AVX512-NEXT:    vmovaps %zmm6, 64(%r9)
16237; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16238; AVX512-NEXT:    vmovaps %zmm6, 128(%r9)
16239; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16240; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16241; AVX512-NEXT:    vmovaps %zmm6, 192(%rax)
16242; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16243; AVX512-NEXT:    vmovaps %zmm6, (%rax)
16244; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16245; AVX512-NEXT:    vmovaps %zmm6, 64(%rax)
16246; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16247; AVX512-NEXT:    vmovaps %zmm6, 128(%rax)
16248; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16249; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rax)
16250; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
16251; AVX512-NEXT:    vmovdqa64 %zmm2, 64(%rax)
16252; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
16253; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16254; AVX512-NEXT:    vmovdqa64 %zmm9, 128(%rax)
16255; AVX512-NEXT:    vmovdqa64 %zmm5, 192(%rax)
16256; AVX512-NEXT:    vmovdqa64 %zmm7, (%rax)
16257; AVX512-NEXT:    vmovdqa64 %zmm4, 64(%rax)
16258; AVX512-NEXT:    addq $3144, %rsp # imm = 0xC48
16259; AVX512-NEXT:    vzeroupper
16260; AVX512-NEXT:    retq
16261;
16262; AVX512-FCP-LABEL: load_i32_stride8_vf64:
16263; AVX512-FCP:       # %bb.0:
16264; AVX512-FCP-NEXT:    subq $3144, %rsp # imm = 0xC48
16265; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
16266; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16267; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
16268; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
16269; AVX512-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
16270; AVX512-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
16271; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
16272; AVX512-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16273; AVX512-FCP-NEXT:    vmovaps 1664(%rdi), %zmm0
16274; AVX512-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16275; AVX512-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm21
16276; AVX512-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
16277; AVX512-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm22
16278; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm5
16279; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
16280; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
16281; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm30
16282; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
16283; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm29
16284; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
16285; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
16286; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
16287; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm25
16288; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
16289; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
16290; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16291; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
16292; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
16293; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16294; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
16295; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
16296; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
16297; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
16298; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16299; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
16300; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
16301; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16302; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
16303; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
16304; AVX512-FCP-NEXT:    movb $-64, %al
16305; AVX512-FCP-NEXT:    kmovw %eax, %k1
16306; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16307; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
16308; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
16309; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
16310; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
16311; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
16312; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
16313; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16314; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16315; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16316; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
16317; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
16318; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16319; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16320; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
16321; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
16322; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16323; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
16324; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
16325; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
16326; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
16327; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
16328; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
16329; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16330; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16331; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16332; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
16333; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16334; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
16335; AVX512-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
16336; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
16337; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16338; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
16339; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
16340; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16341; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16342; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
16343; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
16344; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
16345; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
16346; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
16347; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
16348; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16349; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
16350; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16351; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16352; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16353; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
16354; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16355; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
16356; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
16357; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
16358; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16359; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
16360; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16361; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
16362; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16363; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
16364; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
16365; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm19
16366; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
16367; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16368; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16369; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16370; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16371; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
16372; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16373; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
16374; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16375; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
16376; AVX512-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16377; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
16378; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16379; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16380; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
16381; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm31
16382; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
16383; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm12
16384; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16385; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm30
16386; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
16387; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16388; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16389; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16390; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16391; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
16392; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16393; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
16394; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16395; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16396; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16397; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16398; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
16399; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
16400; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
16401; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16402; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
16403; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
16404; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16405; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16406; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16407; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
16408; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
16409; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
16410; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
16411; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16412; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
16413; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16414; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
16415; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
16416; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
16417; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16418; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16419; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16420; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16421; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
16422; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16423; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
16424; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16425; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
16426; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16427; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
16428; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
16429; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
16430; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
16431; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
16432; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16433; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16434; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16435; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
16436; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16437; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
16438; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
16439; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16440; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
16441; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
16442; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16443; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
16444; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
16445; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
16446; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16447; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16448; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
16449; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16450; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
16451; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16452; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16453; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
16454; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm16
16455; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
16456; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29
16457; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
16458; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16459; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16460; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16461; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
16462; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16463; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
16464; AVX512-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16465; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
16466; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm7
16467; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
16468; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16469; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16470; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
16471; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm25
16472; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
16473; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
16474; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
16475; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm15
16476; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
16477; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16478; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16479; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16480; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
16481; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
16482; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16483; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
16484; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
16485; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16486; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm6
16487; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
16488; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
16489; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm24
16490; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
16491; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
16492; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16493; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16494; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16495; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
16496; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16497; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
16498; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16499; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
16500; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16501; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
16502; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16503; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16504; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm21
16505; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
16506; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16507; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
16508; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16509; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16510; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16511; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
16512; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
16513; AVX512-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16514; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16515; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
16516; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16517; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm31
16518; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16519; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16520; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
16521; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16522; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
16523; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16524; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16525; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16526; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
16527; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
16528; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
16529; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
16530; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm26
16531; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16532; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
16533; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
16534; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
16535; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
16536; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
16537; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16538; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16539; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16540; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
16541; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16542; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
16543; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
16544; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16545; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16546; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16547; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
16548; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm18
16549; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
16550; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
16551; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm24
16552; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
16553; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16554; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16555; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16556; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
16557; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16558; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
16559; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16560; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16561; AVX512-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16562; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
16563; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
16564; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
16565; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16566; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16567; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
16568; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
16569; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16570; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
16571; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16572; AVX512-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16573; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
16574; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16575; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16576; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16577; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
16578; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16579; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
16580; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
16581; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16582; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16583; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
16584; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
16585; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16586; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
16587; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm11
16588; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
16589; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16590; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16591; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16592; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
16593; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
16594; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
16595; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
16596; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16597; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
16598; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm19
16599; AVX512-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
16600; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16601; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16602; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
16603; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
16604; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
16605; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
16606; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm29
16607; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16608; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16609; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16610; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16611; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
16612; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
16613; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
16614; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
16615; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
16616; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16617; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16618; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16619; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
16620; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16621; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm5
16622; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm13
16623; AVX512-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16624; AVX512-FCP-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
16625; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16626; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16627; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16628; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
16629; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16630; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
16631; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
16632; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
16633; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
16634; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
16635; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16636; AVX512-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
16637; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16638; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
16639; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16640; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
16641; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm1
16642; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16643; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
16644; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
16645; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16646; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
16647; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16648; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
16649; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm15
16650; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
16651; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm8
16652; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
16653; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
16654; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
16655; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16656; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16657; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16658; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
16659; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
16660; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
16661; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
16662; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm27
16663; AVX512-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
16664; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm20
16665; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16666; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16667; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
16668; AVX512-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
16669; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
16670; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
16671; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
16672; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
16673; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16674; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16675; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16676; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
16677; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
16678; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
16679; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
16680; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16681; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
16682; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16683; AVX512-FCP-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
16684; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16685; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16686; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16687; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
16688; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16689; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm4
16690; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm30
16691; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
16692; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
16693; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16694; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
16695; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
16696; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
16697; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
16698; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
16699; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
16700; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
16701; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
16702; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16703; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16704; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8
16705; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
16706; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
16707; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16708; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm31
16709; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
16710; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
16711; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm16
16712; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm29
16713; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
16714; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
16715; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26
16716; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
16717; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
16718; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
16719; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
16720; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16721; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
16722; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16723; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
16724; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
16725; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
16726; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16727; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm23
16728; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16729; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
16730; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
16731; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17
16732; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16733; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm24
16734; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16735; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
16736; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
16737; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm27
16738; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16739; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13
16740; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16741; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
16742; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
16743; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16744; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm3
16745; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16746; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
16747; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
16748; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
16749; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20
16750; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
16751; AVX512-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
16752; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
16753; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25
16754; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
16755; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
16756; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16757; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm14
16758; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
16759; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
16760; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16761; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16762; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
16763; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
16764; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
16765; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
16766; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
16767; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
16768; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
16769; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
16770; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16771; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
16772; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
16773; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
16774; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
16775; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
16776; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
16777; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
16778; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
16779; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
16780; AVX512-FCP-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
16781; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
16782; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
16783; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
16784; AVX512-FCP-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
16785; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
16786; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
16787; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
16788; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
16789; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
16790; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
16791; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
16792; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16793; AVX512-FCP-NEXT:    vmovaps %zmm8, 192(%rsi)
16794; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16795; AVX512-FCP-NEXT:    vmovaps %zmm8, 128(%rsi)
16796; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16797; AVX512-FCP-NEXT:    vmovaps %zmm8, 64(%rsi)
16798; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16799; AVX512-FCP-NEXT:    vmovaps %zmm6, (%rsi)
16800; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16801; AVX512-FCP-NEXT:    vmovaps %zmm6, 192(%rdx)
16802; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16803; AVX512-FCP-NEXT:    vmovaps %zmm6, (%rdx)
16804; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16805; AVX512-FCP-NEXT:    vmovaps %zmm6, 64(%rdx)
16806; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16807; AVX512-FCP-NEXT:    vmovaps %zmm6, 128(%rdx)
16808; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16809; AVX512-FCP-NEXT:    vmovaps %zmm6, 192(%rcx)
16810; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16811; AVX512-FCP-NEXT:    vmovaps %zmm6, (%rcx)
16812; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16813; AVX512-FCP-NEXT:    vmovaps %zmm6, 64(%rcx)
16814; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16815; AVX512-FCP-NEXT:    vmovaps %zmm6, 128(%rcx)
16816; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16817; AVX512-FCP-NEXT:    vmovaps %zmm6, 192(%r8)
16818; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16819; AVX512-FCP-NEXT:    vmovaps %zmm6, (%r8)
16820; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16821; AVX512-FCP-NEXT:    vmovaps %zmm6, 64(%r8)
16822; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16823; AVX512-FCP-NEXT:    vmovaps %zmm6, 128(%r8)
16824; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16825; AVX512-FCP-NEXT:    vmovaps %zmm6, 192(%r9)
16826; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16827; AVX512-FCP-NEXT:    vmovaps %zmm6, (%r9)
16828; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16829; AVX512-FCP-NEXT:    vmovaps %zmm6, 64(%r9)
16830; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16831; AVX512-FCP-NEXT:    vmovaps %zmm6, 128(%r9)
16832; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16833; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16834; AVX512-FCP-NEXT:    vmovaps %zmm6, 192(%rax)
16835; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16836; AVX512-FCP-NEXT:    vmovaps %zmm6, (%rax)
16837; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16838; AVX512-FCP-NEXT:    vmovaps %zmm6, 64(%rax)
16839; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16840; AVX512-FCP-NEXT:    vmovaps %zmm6, 128(%rax)
16841; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16842; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
16843; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
16844; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
16845; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
16846; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16847; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
16848; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
16849; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
16850; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
16851; AVX512-FCP-NEXT:    addq $3144, %rsp # imm = 0xC48
16852; AVX512-FCP-NEXT:    vzeroupper
16853; AVX512-FCP-NEXT:    retq
16854;
16855; AVX512DQ-LABEL: load_i32_stride8_vf64:
16856; AVX512DQ:       # %bb.0:
16857; AVX512DQ-NEXT:    subq $3144, %rsp # imm = 0xC48
16858; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm11
16859; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16860; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm18
16861; AVX512DQ-NEXT:    vmovdqa64 1600(%rdi), %zmm31
16862; AVX512DQ-NEXT:    vmovaps 1536(%rdi), %zmm0
16863; AVX512DQ-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
16864; AVX512DQ-NEXT:    vmovdqa64 1728(%rdi), %zmm24
16865; AVX512DQ-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16866; AVX512DQ-NEXT:    vmovaps 1664(%rdi), %zmm0
16867; AVX512DQ-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16868; AVX512DQ-NEXT:    vmovdqa64 1856(%rdi), %zmm21
16869; AVX512DQ-NEXT:    vmovdqa64 1792(%rdi), %zmm26
16870; AVX512DQ-NEXT:    vmovdqa64 1984(%rdi), %zmm22
16871; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm5
16872; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm13
16873; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm3
16874; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %zmm30
16875; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm2
16876; AVX512DQ-NEXT:    vmovdqa64 1344(%rdi), %zmm29
16877; AVX512DQ-NEXT:    vmovdqa64 1280(%rdi), %zmm27
16878; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm20
16879; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm10
16880; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm25
16881; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm7
16882; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm9
16883; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16884; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm12
16885; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm6
16886; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16887; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm28
16888; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm23
16889; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm4
16890; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
16891; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16892; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm1
16893; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm16
16894; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16895; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
16896; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
16897; AVX512DQ-NEXT:    movb $-64, %al
16898; AVX512DQ-NEXT:    kmovw %eax, %k1
16899; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16900; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm1
16901; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm15
16902; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
16903; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm12
16904; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm9
16905; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
16906; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16907; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16908; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16909; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm1
16910; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm6
16911; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16912; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16913; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm4
16914; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
16915; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16916; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm1
16917; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
16918; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
16919; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
16920; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm12
16921; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
16922; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16923; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16924; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16925; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm14
16926; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16927; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
16928; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
16929; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm17
16930; AVX512DQ-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16931; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm4
16932; AVX512DQ-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
16933; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16934; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16935; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm1
16936; AVX512DQ-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
16937; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
16938; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
16939; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm24
16940; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
16941; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16942; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm5
16943; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16944; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16945; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16946; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
16947; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
16948; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm22
16949; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
16950; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
16951; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16952; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm1
16953; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16954; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm5
16955; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16956; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
16957; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm7
16958; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm19
16959; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
16960; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16961; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16962; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16963; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16964; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
16965; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16966; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm1
16967; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
16968; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm4
16969; AVX512DQ-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16970; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
16971; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16972; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16973; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm1
16974; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm31
16975; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
16976; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm12
16977; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16978; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm30
16979; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
16980; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16981; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16982; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16983; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16984; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm1
16985; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
16986; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
16987; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16988; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
16989; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
16990; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16991; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm1
16992; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
16993; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
16994; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16995; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm12
16996; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
16997; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16998; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16999; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17000; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm1
17001; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
17002; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm4
17003; AVX512DQ-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
17004; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17005; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm1
17006; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17007; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
17008; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
17009; AVX512DQ-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
17010; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17011; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17012; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17013; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17014; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm1
17015; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17016; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
17017; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17018; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17019; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17020; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17021; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm1
17022; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17023; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
17024; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
17025; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17026; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17027; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17028; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
17029; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17030; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17031; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1
17032; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17033; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm4
17034; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
17035; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17036; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
17037; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
17038; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
17039; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17040; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17041; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm1
17042; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17043; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
17044; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17045; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17046; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
17047; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm16
17048; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
17049; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm29
17050; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
17051; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17052; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17053; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17054; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm1
17055; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17056; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
17057; AVX512DQ-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17058; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm4
17059; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm7
17060; AVX512DQ-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
17061; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17062; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17063; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm1
17064; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm25
17065; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
17066; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
17067; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm12
17068; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm15
17069; AVX512DQ-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
17070; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17071; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17072; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17073; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm13
17074; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm1
17075; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17076; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
17077; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17078; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17079; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm6
17080; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm1
17081; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
17082; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm24
17083; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17084; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
17085; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17086; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17087; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17088; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
17089; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17090; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1
17091; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17092; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm4
17093; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17094; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17095; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17096; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17097; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm21
17098; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
17099; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17100; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
17101; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17102; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17103; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17104; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17105; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm1
17106; AVX512DQ-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17107; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17108; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
17109; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17110; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm31
17111; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17112; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17113; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
17114; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17115; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
17116; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17117; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17118; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17119; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm1
17120; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
17121; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm4
17122; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
17123; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm26
17124; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17125; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm1
17126; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm9
17127; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
17128; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm12
17129; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
17130; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17131; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17132; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17133; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm1
17134; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17135; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm25
17136; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
17137; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17138; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17139; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17140; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm1
17141; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm18
17142; AVX512DQ-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
17143; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
17144; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm24
17145; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm13
17146; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17147; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17148; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17149; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
17150; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17151; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1
17152; AVX512DQ-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17153; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17154; AVX512DQ-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17155; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm4
17156; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm3
17157; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17158; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17159; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17160; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm1
17161; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm6
17162; AVX512DQ-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17163; AVX512DQ-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
17164; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17165; AVX512DQ-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17166; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
17167; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17168; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17169; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17170; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm1
17171; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17172; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
17173; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
17174; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17175; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17176; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm1
17177; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
17178; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17179; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
17180; AVX512DQ-NEXT:    vmovdqa64 %zmm29, %zmm11
17181; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
17182; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17183; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17184; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17185; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
17186; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm1
17187; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
17188; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm31
17189; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17190; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm4
17191; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm19
17192; AVX512DQ-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
17193; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17194; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17195; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
17196; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm26
17197; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
17198; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
17199; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm29
17200; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17201; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17202; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17203; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17204; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17205; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm1
17206; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
17207; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
17208; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm14
17209; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17210; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17211; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17212; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm1
17213; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17214; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm5
17215; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm13
17216; AVX512DQ-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17217; AVX512DQ-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
17218; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17219; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17220; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17221; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
17222; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17223; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
17224; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
17225; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
17226; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm1
17227; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
17228; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17229; AVX512DQ-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
17230; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17231; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
17232; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17233; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
17234; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm1
17235; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17236; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
17237; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
17238; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17239; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
17240; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17241; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm1
17242; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm15
17243; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
17244; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm8
17245; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
17246; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
17247; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm2
17248; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17249; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17250; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17251; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm1
17252; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
17253; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm16
17254; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm4
17255; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm27
17256; AVX512DQ-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
17257; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm20
17258; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17259; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17260; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm1
17261; AVX512DQ-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
17262; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm17
17263; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
17264; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm12
17265; AVX512DQ-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
17266; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17267; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17268; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17269; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm1
17270; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
17271; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm4
17272; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
17273; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17274; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm1
17275; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17276; AVX512DQ-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
17277; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17278; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17279; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17280; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
17281; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17282; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm4
17283; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm30
17284; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
17285; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
17286; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17287; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
17288; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm19
17289; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm11
17290; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
17291; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
17292; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm9
17293; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
17294; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
17295; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17296; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17297; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm8
17298; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
17299; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
17300; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17301; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm31
17302; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
17303; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
17304; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm16
17305; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm29
17306; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
17307; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
17308; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm26
17309; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm2
17310; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm5
17311; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
17312; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
17313; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17314; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm4
17315; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17316; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
17317; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
17318; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm15
17319; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17320; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm23
17321; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17322; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
17323; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
17324; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm17
17325; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17326; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm24
17327; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17328; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
17329; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
17330; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm27
17331; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17332; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm13
17333; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17334; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
17335; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
17336; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17337; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm3
17338; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17339; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
17340; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
17341; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm6
17342; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm20
17343; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
17344; AVX512DQ-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
17345; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm21
17346; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm25
17347; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
17348; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
17349; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17350; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm14
17351; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
17352; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
17353; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17354; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17355; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
17356; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
17357; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm6
17358; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
17359; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
17360; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
17361; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
17362; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
17363; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
17364; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
17365; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
17366; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
17367; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
17368; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
17369; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
17370; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
17371; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
17372; AVX512DQ-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
17373; AVX512DQ-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
17374; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
17375; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
17376; AVX512DQ-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
17377; AVX512DQ-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
17378; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
17379; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
17380; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
17381; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
17382; AVX512DQ-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
17383; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
17384; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
17385; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17386; AVX512DQ-NEXT:    vmovaps %zmm8, 192(%rsi)
17387; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17388; AVX512DQ-NEXT:    vmovaps %zmm8, 128(%rsi)
17389; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17390; AVX512DQ-NEXT:    vmovaps %zmm8, 64(%rsi)
17391; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17392; AVX512DQ-NEXT:    vmovaps %zmm6, (%rsi)
17393; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17394; AVX512DQ-NEXT:    vmovaps %zmm6, 192(%rdx)
17395; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17396; AVX512DQ-NEXT:    vmovaps %zmm6, (%rdx)
17397; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17398; AVX512DQ-NEXT:    vmovaps %zmm6, 64(%rdx)
17399; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17400; AVX512DQ-NEXT:    vmovaps %zmm6, 128(%rdx)
17401; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17402; AVX512DQ-NEXT:    vmovaps %zmm6, 192(%rcx)
17403; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17404; AVX512DQ-NEXT:    vmovaps %zmm6, (%rcx)
17405; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17406; AVX512DQ-NEXT:    vmovaps %zmm6, 64(%rcx)
17407; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17408; AVX512DQ-NEXT:    vmovaps %zmm6, 128(%rcx)
17409; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17410; AVX512DQ-NEXT:    vmovaps %zmm6, 192(%r8)
17411; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17412; AVX512DQ-NEXT:    vmovaps %zmm6, (%r8)
17413; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17414; AVX512DQ-NEXT:    vmovaps %zmm6, 64(%r8)
17415; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17416; AVX512DQ-NEXT:    vmovaps %zmm6, 128(%r8)
17417; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17418; AVX512DQ-NEXT:    vmovaps %zmm6, 192(%r9)
17419; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17420; AVX512DQ-NEXT:    vmovaps %zmm6, (%r9)
17421; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17422; AVX512DQ-NEXT:    vmovaps %zmm6, 64(%r9)
17423; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17424; AVX512DQ-NEXT:    vmovaps %zmm6, 128(%r9)
17425; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
17426; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17427; AVX512DQ-NEXT:    vmovaps %zmm6, 192(%rax)
17428; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17429; AVX512DQ-NEXT:    vmovaps %zmm6, (%rax)
17430; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17431; AVX512DQ-NEXT:    vmovaps %zmm6, 64(%rax)
17432; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17433; AVX512DQ-NEXT:    vmovaps %zmm6, 128(%rax)
17434; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
17435; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rax)
17436; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
17437; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rax)
17438; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
17439; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
17440; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 128(%rax)
17441; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 192(%rax)
17442; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rax)
17443; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rax)
17444; AVX512DQ-NEXT:    addq $3144, %rsp # imm = 0xC48
17445; AVX512DQ-NEXT:    vzeroupper
17446; AVX512DQ-NEXT:    retq
17447;
17448; AVX512DQ-FCP-LABEL: load_i32_stride8_vf64:
17449; AVX512DQ-FCP:       # %bb.0:
17450; AVX512DQ-FCP-NEXT:    subq $3144, %rsp # imm = 0xC48
17451; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
17452; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17453; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
17454; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
17455; AVX512DQ-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
17456; AVX512DQ-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
17457; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
17458; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17459; AVX512DQ-FCP-NEXT:    vmovaps 1664(%rdi), %zmm0
17460; AVX512DQ-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17461; AVX512DQ-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm21
17462; AVX512DQ-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
17463; AVX512DQ-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm22
17464; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm5
17465; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
17466; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
17467; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm30
17468; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
17469; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm29
17470; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
17471; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
17472; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
17473; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm25
17474; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
17475; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
17476; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17477; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
17478; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
17479; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17480; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
17481; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
17482; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
17483; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
17484; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17485; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
17486; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
17487; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17488; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
17489; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
17490; AVX512DQ-FCP-NEXT:    movb $-64, %al
17491; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
17492; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17493; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
17494; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
17495; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
17496; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
17497; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
17498; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
17499; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17500; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17501; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17502; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
17503; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
17504; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17505; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17506; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
17507; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
17508; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17509; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
17510; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
17511; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
17512; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
17513; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
17514; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
17515; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17516; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17517; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17518; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
17519; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17520; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
17521; AVX512DQ-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
17522; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
17523; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17524; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
17525; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
17526; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17527; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17528; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
17529; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
17530; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
17531; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
17532; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
17533; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
17534; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17535; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
17536; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17537; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17538; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17539; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
17540; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17541; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
17542; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
17543; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
17544; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17545; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
17546; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17547; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
17548; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17549; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
17550; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
17551; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm19
17552; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
17553; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17554; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17555; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17556; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17557; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
17558; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17559; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
17560; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17561; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
17562; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17563; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
17564; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17565; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17566; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
17567; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm31
17568; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
17569; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm12
17570; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17571; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm30
17572; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
17573; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17574; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17575; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17576; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17577; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
17578; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17579; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
17580; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17581; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17582; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17583; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17584; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
17585; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
17586; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
17587; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17588; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
17589; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
17590; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17591; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17592; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17593; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
17594; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
17595; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
17596; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
17597; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17598; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
17599; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17600; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
17601; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
17602; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
17603; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17604; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17605; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17606; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17607; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
17608; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17609; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
17610; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17611; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17612; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17613; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17614; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
17615; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17616; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
17617; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
17618; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17619; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17620; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17621; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
17622; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17623; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17624; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
17625; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17626; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
17627; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
17628; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17629; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
17630; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
17631; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
17632; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17633; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17634; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
17635; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17636; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
17637; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17638; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17639; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
17640; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm16
17641; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
17642; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29
17643; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
17644; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17645; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17646; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17647; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
17648; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17649; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
17650; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17651; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
17652; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm7
17653; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
17654; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17655; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17656; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
17657; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm25
17658; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
17659; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
17660; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
17661; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm15
17662; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
17663; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17664; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17665; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17666; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
17667; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
17668; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17669; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
17670; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17671; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17672; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm6
17673; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
17674; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
17675; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm24
17676; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17677; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
17678; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17679; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17680; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17681; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
17682; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17683; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
17684; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17685; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
17686; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17687; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17688; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17689; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17690; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm21
17691; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
17692; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17693; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
17694; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17695; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17696; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17697; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17698; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
17699; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17700; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17701; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
17702; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17703; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm31
17704; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17705; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17706; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
17707; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17708; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
17709; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17710; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17711; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17712; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
17713; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
17714; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
17715; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
17716; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm26
17717; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17718; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
17719; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
17720; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
17721; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
17722; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
17723; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17724; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17725; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17726; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
17727; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17728; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
17729; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
17730; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17731; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17732; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17733; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
17734; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm18
17735; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
17736; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
17737; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm24
17738; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
17739; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17740; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17741; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17742; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
17743; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17744; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
17745; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17746; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
17747; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17748; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
17749; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
17750; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
17751; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17752; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17753; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
17754; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
17755; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17756; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
17757; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17758; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17759; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
17760; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17761; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17762; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17763; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
17764; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
17765; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
17766; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
17767; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17768; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17769; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
17770; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
17771; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17772; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
17773; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm11
17774; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
17775; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17776; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17777; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17778; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
17779; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
17780; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
17781; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
17782; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17783; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
17784; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm19
17785; AVX512DQ-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
17786; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17787; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17788; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
17789; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
17790; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
17791; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
17792; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm29
17793; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17794; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17795; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17796; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17797; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17798; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
17799; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
17800; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
17801; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
17802; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
17803; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17804; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17805; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
17806; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17807; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm5
17808; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm13
17809; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17810; AVX512DQ-FCP-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
17811; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17812; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17813; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17814; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
17815; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17816; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
17817; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
17818; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
17819; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
17820; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
17821; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17822; AVX512DQ-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
17823; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17824; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
17825; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17826; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
17827; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm1
17828; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17829; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
17830; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
17831; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17832; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
17833; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17834; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
17835; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm15
17836; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
17837; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm8
17838; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
17839; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
17840; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
17841; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17842; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17843; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17844; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
17845; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
17846; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
17847; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
17848; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm27
17849; AVX512DQ-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
17850; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm20
17851; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17852; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17853; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
17854; AVX512DQ-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
17855; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
17856; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
17857; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
17858; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
17859; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17860; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17861; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17862; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
17863; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
17864; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
17865; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
17866; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
17867; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
17868; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
17869; AVX512DQ-FCP-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
17870; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17871; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17872; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17873; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
17874; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17875; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm4
17876; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm30
17877; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
17878; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
17879; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17880; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
17881; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
17882; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
17883; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
17884; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
17885; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
17886; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
17887; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
17888; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17889; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17890; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8
17891; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
17892; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
17893; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17894; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm31
17895; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
17896; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
17897; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm16
17898; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm29
17899; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
17900; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
17901; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26
17902; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
17903; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
17904; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
17905; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
17906; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17907; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
17908; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17909; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
17910; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
17911; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
17912; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17913; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm23
17914; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17915; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
17916; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
17917; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17
17918; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17919; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm24
17920; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17921; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
17922; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
17923; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm27
17924; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17925; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13
17926; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17927; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
17928; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
17929; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17930; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm3
17931; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17932; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
17933; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
17934; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
17935; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20
17936; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
17937; AVX512DQ-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
17938; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
17939; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25
17940; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
17941; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
17942; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17943; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm14
17944; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
17945; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
17946; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17947; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17948; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
17949; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
17950; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
17951; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
17952; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
17953; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
17954; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
17955; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
17956; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
17957; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
17958; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
17959; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
17960; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
17961; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
17962; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
17963; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
17964; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
17965; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
17966; AVX512DQ-FCP-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
17967; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
17968; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
17969; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
17970; AVX512DQ-FCP-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
17971; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
17972; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
17973; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
17974; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
17975; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
17976; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
17977; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
17978; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17979; AVX512DQ-FCP-NEXT:    vmovaps %zmm8, 192(%rsi)
17980; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17981; AVX512DQ-FCP-NEXT:    vmovaps %zmm8, 128(%rsi)
17982; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17983; AVX512DQ-FCP-NEXT:    vmovaps %zmm8, 64(%rsi)
17984; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17985; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, (%rsi)
17986; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17987; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 192(%rdx)
17988; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17989; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, (%rdx)
17990; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17991; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 64(%rdx)
17992; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17993; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 128(%rdx)
17994; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17995; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 192(%rcx)
17996; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17997; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, (%rcx)
17998; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17999; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 64(%rcx)
18000; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18001; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 128(%rcx)
18002; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18003; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 192(%r8)
18004; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18005; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, (%r8)
18006; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18007; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 64(%r8)
18008; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18009; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 128(%r8)
18010; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18011; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 192(%r9)
18012; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18013; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, (%r9)
18014; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18015; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 64(%r9)
18016; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18017; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 128(%r9)
18018; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
18019; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18020; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 192(%rax)
18021; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18022; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, (%rax)
18023; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18024; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 64(%rax)
18025; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18026; AVX512DQ-FCP-NEXT:    vmovaps %zmm6, 128(%rax)
18027; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
18028; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
18029; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
18030; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
18031; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
18032; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
18033; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
18034; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
18035; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
18036; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
18037; AVX512DQ-FCP-NEXT:    addq $3144, %rsp # imm = 0xC48
18038; AVX512DQ-FCP-NEXT:    vzeroupper
18039; AVX512DQ-FCP-NEXT:    retq
18040;
18041; AVX512BW-LABEL: load_i32_stride8_vf64:
18042; AVX512BW:       # %bb.0:
18043; AVX512BW-NEXT:    subq $3144, %rsp # imm = 0xC48
18044; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm11
18045; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18046; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm18
18047; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
18048; AVX512BW-NEXT:    vmovaps 1536(%rdi), %zmm0
18049; AVX512BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
18050; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm24
18051; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18052; AVX512BW-NEXT:    vmovaps 1664(%rdi), %zmm0
18053; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18054; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm21
18055; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm26
18056; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm22
18057; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm5
18058; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
18059; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
18060; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm30
18061; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
18062; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm29
18063; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
18064; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
18065; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
18066; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm25
18067; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
18068; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
18069; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18070; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
18071; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
18072; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18073; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
18074; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
18075; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
18076; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
18077; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18078; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
18079; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm16
18080; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18081; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
18082; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
18083; AVX512BW-NEXT:    movb $-64, %al
18084; AVX512BW-NEXT:    kmovd %eax, %k1
18085; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18086; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
18087; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15
18088; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
18089; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
18090; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
18091; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
18092; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18093; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18094; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18095; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
18096; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
18097; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18098; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18099; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
18100; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
18101; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18102; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
18103; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
18104; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
18105; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
18106; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
18107; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
18108; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18109; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18110; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18111; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
18112; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18113; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
18114; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
18115; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm17
18116; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18117; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
18118; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
18119; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18120; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18121; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
18122; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
18123; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
18124; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
18125; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm24
18126; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
18127; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18128; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
18129; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18130; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18131; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18132; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
18133; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18134; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm22
18135; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
18136; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
18137; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18138; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
18139; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18140; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
18141; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18142; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
18143; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
18144; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
18145; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
18146; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18147; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18148; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18149; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18150; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
18151; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18152; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
18153; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18154; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
18155; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18156; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
18157; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18158; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18159; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
18160; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm31
18161; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
18162; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm12
18163; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18164; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm30
18165; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
18166; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18167; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18168; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18169; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18170; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
18171; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18172; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
18173; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18174; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18175; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18176; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18177; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
18178; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18179; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
18180; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18181; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm12
18182; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
18183; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18184; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18185; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18186; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
18187; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
18188; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
18189; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
18190; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18191; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
18192; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18193; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
18194; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
18195; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
18196; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18197; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18198; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18199; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18200; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
18201; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18202; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
18203; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18204; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18205; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18206; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18207; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
18208; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18209; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
18210; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
18211; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18212; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18213; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18214; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
18215; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18216; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
18217; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
18218; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18219; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
18220; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
18221; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18222; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
18223; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
18224; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
18225; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18226; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18227; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
18228; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18229; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
18230; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18231; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18232; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
18233; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm16
18234; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
18235; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm29
18236; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
18237; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18238; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18239; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18240; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
18241; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18242; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
18243; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18244; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
18245; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm7
18246; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
18247; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18248; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18249; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
18250; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm25
18251; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
18252; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
18253; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
18254; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm15
18255; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
18256; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18257; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18258; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18259; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
18260; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
18261; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18262; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
18263; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18264; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18265; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
18266; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
18267; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
18268; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm24
18269; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18270; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
18271; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18272; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18273; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18274; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
18275; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18276; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
18277; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18278; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
18279; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18280; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18281; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18282; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18283; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm21
18284; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
18285; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18286; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
18287; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18288; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18289; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18290; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18291; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
18292; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18293; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18294; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
18295; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18296; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm31
18297; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18298; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18299; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
18300; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18301; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
18302; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18303; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18304; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18305; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
18306; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
18307; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
18308; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
18309; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm26
18310; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18311; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
18312; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
18313; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
18314; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
18315; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
18316; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18317; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18318; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18319; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
18320; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18321; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm25
18322; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
18323; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18324; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18325; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18326; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
18327; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm18
18328; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
18329; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
18330; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm24
18331; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
18332; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18333; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18334; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18335; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
18336; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18337; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
18338; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18339; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18340; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18341; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
18342; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
18343; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18344; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18345; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18346; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
18347; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
18348; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18349; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
18350; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18351; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18352; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
18353; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18354; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18355; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18356; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
18357; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18358; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
18359; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
18360; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18361; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18362; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
18363; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
18364; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18365; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
18366; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm11
18367; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
18368; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18369; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18370; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18371; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
18372; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
18373; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
18374; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm31
18375; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18376; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
18377; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm19
18378; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
18379; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18380; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18381; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
18382; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm26
18383; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
18384; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
18385; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm29
18386; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18387; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18388; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18389; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18390; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
18391; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
18392; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
18393; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
18394; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
18395; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18396; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18397; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18398; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
18399; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18400; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
18401; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm13
18402; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18403; AVX512BW-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
18404; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18405; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18406; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18407; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
18408; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18409; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
18410; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
18411; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
18412; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
18413; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
18414; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18415; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
18416; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18417; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
18418; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18419; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
18420; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm1
18421; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18422; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
18423; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
18424; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18425; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
18426; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18427; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
18428; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm15
18429; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
18430; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm8
18431; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
18432; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
18433; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
18434; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18435; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18436; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18437; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
18438; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
18439; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm16
18440; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
18441; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm27
18442; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
18443; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm20
18444; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18445; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18446; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
18447; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
18448; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm17
18449; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
18450; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
18451; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
18452; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18453; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18454; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18455; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
18456; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
18457; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
18458; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
18459; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18460; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
18461; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18462; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
18463; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18464; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18465; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18466; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
18467; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18468; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm4
18469; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm30
18470; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
18471; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
18472; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18473; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
18474; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm19
18475; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm11
18476; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
18477; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
18478; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
18479; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
18480; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
18481; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18482; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18483; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
18484; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
18485; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
18486; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18487; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm31
18488; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
18489; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
18490; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm16
18491; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm29
18492; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
18493; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
18494; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm26
18495; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
18496; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm5
18497; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
18498; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
18499; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18500; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
18501; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18502; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
18503; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
18504; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
18505; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18506; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm23
18507; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18508; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
18509; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
18510; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17
18511; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18512; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm24
18513; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18514; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
18515; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
18516; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm27
18517; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18518; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13
18519; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18520; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
18521; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
18522; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18523; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
18524; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18525; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
18526; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
18527; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
18528; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm20
18529; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
18530; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
18531; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm21
18532; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm25
18533; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
18534; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
18535; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18536; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm14
18537; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
18538; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
18539; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18540; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18541; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
18542; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
18543; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
18544; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
18545; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
18546; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
18547; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
18548; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
18549; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
18550; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
18551; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
18552; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
18553; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
18554; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
18555; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
18556; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
18557; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18558; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
18559; AVX512BW-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
18560; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
18561; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
18562; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
18563; AVX512BW-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
18564; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
18565; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
18566; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
18567; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
18568; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
18569; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
18570; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
18571; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18572; AVX512BW-NEXT:    vmovaps %zmm8, 192(%rsi)
18573; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18574; AVX512BW-NEXT:    vmovaps %zmm8, 128(%rsi)
18575; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18576; AVX512BW-NEXT:    vmovaps %zmm8, 64(%rsi)
18577; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18578; AVX512BW-NEXT:    vmovaps %zmm6, (%rsi)
18579; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18580; AVX512BW-NEXT:    vmovaps %zmm6, 192(%rdx)
18581; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18582; AVX512BW-NEXT:    vmovaps %zmm6, (%rdx)
18583; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18584; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rdx)
18585; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18586; AVX512BW-NEXT:    vmovaps %zmm6, 128(%rdx)
18587; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18588; AVX512BW-NEXT:    vmovaps %zmm6, 192(%rcx)
18589; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18590; AVX512BW-NEXT:    vmovaps %zmm6, (%rcx)
18591; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18592; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rcx)
18593; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18594; AVX512BW-NEXT:    vmovaps %zmm6, 128(%rcx)
18595; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18596; AVX512BW-NEXT:    vmovaps %zmm6, 192(%r8)
18597; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18598; AVX512BW-NEXT:    vmovaps %zmm6, (%r8)
18599; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18600; AVX512BW-NEXT:    vmovaps %zmm6, 64(%r8)
18601; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18602; AVX512BW-NEXT:    vmovaps %zmm6, 128(%r8)
18603; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18604; AVX512BW-NEXT:    vmovaps %zmm6, 192(%r9)
18605; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18606; AVX512BW-NEXT:    vmovaps %zmm6, (%r9)
18607; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18608; AVX512BW-NEXT:    vmovaps %zmm6, 64(%r9)
18609; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18610; AVX512BW-NEXT:    vmovaps %zmm6, 128(%r9)
18611; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
18612; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18613; AVX512BW-NEXT:    vmovaps %zmm6, 192(%rax)
18614; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18615; AVX512BW-NEXT:    vmovaps %zmm6, (%rax)
18616; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18617; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rax)
18618; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18619; AVX512BW-NEXT:    vmovaps %zmm6, 128(%rax)
18620; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
18621; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
18622; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
18623; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
18624; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
18625; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
18626; AVX512BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
18627; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
18628; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
18629; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
18630; AVX512BW-NEXT:    addq $3144, %rsp # imm = 0xC48
18631; AVX512BW-NEXT:    vzeroupper
18632; AVX512BW-NEXT:    retq
18633;
18634; AVX512BW-FCP-LABEL: load_i32_stride8_vf64:
18635; AVX512BW-FCP:       # %bb.0:
18636; AVX512BW-FCP-NEXT:    subq $3144, %rsp # imm = 0xC48
18637; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
18638; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18639; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
18640; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
18641; AVX512BW-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
18642; AVX512BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
18643; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
18644; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18645; AVX512BW-FCP-NEXT:    vmovaps 1664(%rdi), %zmm0
18646; AVX512BW-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18647; AVX512BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm21
18648; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
18649; AVX512BW-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm22
18650; AVX512BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm5
18651; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
18652; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
18653; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm30
18654; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
18655; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm29
18656; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
18657; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
18658; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
18659; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm25
18660; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
18661; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
18662; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18663; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
18664; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
18665; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18666; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
18667; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
18668; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
18669; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
18670; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18671; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
18672; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
18673; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18674; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
18675; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
18676; AVX512BW-FCP-NEXT:    movb $-64, %al
18677; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
18678; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18679; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
18680; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
18681; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
18682; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
18683; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
18684; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
18685; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18686; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18687; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18688; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
18689; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
18690; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18691; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18692; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
18693; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
18694; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18695; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
18696; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
18697; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
18698; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
18699; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
18700; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
18701; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18702; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18703; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18704; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
18705; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18706; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
18707; AVX512BW-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
18708; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
18709; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18710; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
18711; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
18712; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18713; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18714; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
18715; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
18716; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
18717; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
18718; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
18719; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
18720; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18721; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
18722; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18723; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18724; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18725; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
18726; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18727; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
18728; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
18729; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
18730; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18731; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
18732; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18733; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
18734; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18735; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
18736; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
18737; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm19
18738; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
18739; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18740; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18741; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18742; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18743; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
18744; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18745; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
18746; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18747; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
18748; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18749; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
18750; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18751; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18752; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
18753; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm31
18754; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
18755; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm12
18756; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18757; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm30
18758; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
18759; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18760; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18761; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18762; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18763; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
18764; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18765; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
18766; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18767; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18768; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18769; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18770; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
18771; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18772; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
18773; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18774; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
18775; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
18776; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18777; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18778; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18779; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
18780; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
18781; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
18782; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
18783; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18784; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
18785; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18786; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
18787; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
18788; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
18789; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18790; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18791; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18792; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18793; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
18794; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18795; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
18796; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18797; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18798; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18799; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18800; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
18801; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18802; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
18803; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
18804; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18805; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18806; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18807; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
18808; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18809; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
18810; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
18811; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18812; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
18813; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
18814; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18815; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
18816; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
18817; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
18818; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18819; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18820; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
18821; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18822; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
18823; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18824; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18825; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
18826; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm16
18827; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
18828; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29
18829; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
18830; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18831; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18832; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18833; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
18834; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18835; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
18836; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18837; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
18838; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm7
18839; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
18840; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18841; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18842; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
18843; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm25
18844; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
18845; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
18846; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
18847; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm15
18848; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
18849; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18850; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18851; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18852; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
18853; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
18854; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18855; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
18856; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18857; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18858; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm6
18859; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
18860; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
18861; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm24
18862; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18863; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
18864; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18865; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18866; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18867; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
18868; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18869; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
18870; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18871; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
18872; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18873; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18874; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18875; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18876; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm21
18877; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
18878; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18879; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
18880; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18881; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18882; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18883; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18884; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
18885; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18886; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18887; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
18888; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18889; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm31
18890; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18891; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18892; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
18893; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18894; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
18895; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18896; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18897; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18898; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
18899; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
18900; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
18901; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
18902; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm26
18903; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18904; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
18905; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
18906; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
18907; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
18908; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
18909; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18910; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18911; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18912; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
18913; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18914; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
18915; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
18916; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18917; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18918; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18919; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
18920; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm18
18921; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
18922; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
18923; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm24
18924; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
18925; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18926; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18927; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18928; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
18929; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18930; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
18931; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18932; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
18933; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18934; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
18935; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
18936; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
18937; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18938; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18939; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
18940; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
18941; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18942; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
18943; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18944; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18945; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
18946; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18947; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18948; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18949; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
18950; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
18951; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
18952; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
18953; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18954; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18955; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
18956; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
18957; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18958; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
18959; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm11
18960; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
18961; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18962; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18963; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18964; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
18965; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
18966; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
18967; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
18968; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18969; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
18970; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm19
18971; AVX512BW-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
18972; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18973; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18974; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
18975; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
18976; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
18977; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
18978; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm29
18979; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18980; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18981; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18982; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18983; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
18984; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
18985; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
18986; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
18987; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
18988; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
18989; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
18990; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18991; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
18992; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
18993; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm5
18994; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm13
18995; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18996; AVX512BW-FCP-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
18997; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18998; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18999; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19000; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
19001; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19002; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
19003; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
19004; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
19005; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
19006; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
19007; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19008; AVX512BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
19009; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19010; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
19011; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19012; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19013; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm1
19014; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19015; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
19016; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
19017; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19018; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
19019; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19020; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
19021; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm15
19022; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
19023; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm8
19024; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
19025; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
19026; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
19027; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19028; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19029; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19030; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
19031; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
19032; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
19033; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
19034; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm27
19035; AVX512BW-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
19036; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm20
19037; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19038; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19039; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
19040; AVX512BW-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
19041; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
19042; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
19043; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
19044; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
19045; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19046; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19047; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19048; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
19049; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
19050; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
19051; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
19052; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19053; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
19054; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19055; AVX512BW-FCP-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
19056; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19057; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19058; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19059; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
19060; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19061; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm4
19062; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm30
19063; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
19064; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
19065; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19066; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
19067; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
19068; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
19069; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
19070; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
19071; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
19072; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
19073; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
19074; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19075; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19076; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8
19077; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
19078; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
19079; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19080; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm31
19081; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
19082; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
19083; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm16
19084; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm29
19085; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
19086; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
19087; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26
19088; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
19089; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
19090; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
19091; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
19092; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19093; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
19094; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19095; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
19096; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
19097; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
19098; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19099; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm23
19100; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19101; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
19102; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
19103; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17
19104; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19105; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm24
19106; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19107; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
19108; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
19109; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm27
19110; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19111; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13
19112; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19113; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
19114; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
19115; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19116; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm3
19117; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19118; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
19119; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
19120; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
19121; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20
19122; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
19123; AVX512BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
19124; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
19125; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25
19126; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
19127; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
19128; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19129; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm14
19130; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
19131; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
19132; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19133; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19134; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
19135; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
19136; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
19137; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
19138; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
19139; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
19140; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
19141; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
19142; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
19143; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
19144; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
19145; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
19146; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
19147; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
19148; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
19149; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
19150; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19151; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
19152; AVX512BW-FCP-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
19153; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
19154; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
19155; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
19156; AVX512BW-FCP-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
19157; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
19158; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
19159; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
19160; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
19161; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
19162; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
19163; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
19164; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19165; AVX512BW-FCP-NEXT:    vmovaps %zmm8, 192(%rsi)
19166; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19167; AVX512BW-FCP-NEXT:    vmovaps %zmm8, 128(%rsi)
19168; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19169; AVX512BW-FCP-NEXT:    vmovaps %zmm8, 64(%rsi)
19170; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19171; AVX512BW-FCP-NEXT:    vmovaps %zmm6, (%rsi)
19172; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19173; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 192(%rdx)
19174; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19175; AVX512BW-FCP-NEXT:    vmovaps %zmm6, (%rdx)
19176; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19177; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 64(%rdx)
19178; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19179; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 128(%rdx)
19180; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19181; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 192(%rcx)
19182; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19183; AVX512BW-FCP-NEXT:    vmovaps %zmm6, (%rcx)
19184; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19185; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 64(%rcx)
19186; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19187; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 128(%rcx)
19188; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19189; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 192(%r8)
19190; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19191; AVX512BW-FCP-NEXT:    vmovaps %zmm6, (%r8)
19192; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19193; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 64(%r8)
19194; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19195; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 128(%r8)
19196; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19197; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 192(%r9)
19198; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19199; AVX512BW-FCP-NEXT:    vmovaps %zmm6, (%r9)
19200; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19201; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 64(%r9)
19202; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19203; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 128(%r9)
19204; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
19205; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19206; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 192(%rax)
19207; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19208; AVX512BW-FCP-NEXT:    vmovaps %zmm6, (%rax)
19209; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19210; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 64(%rax)
19211; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19212; AVX512BW-FCP-NEXT:    vmovaps %zmm6, 128(%rax)
19213; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
19214; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
19215; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
19216; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
19217; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
19218; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
19219; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
19220; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
19221; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
19222; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
19223; AVX512BW-FCP-NEXT:    addq $3144, %rsp # imm = 0xC48
19224; AVX512BW-FCP-NEXT:    vzeroupper
19225; AVX512BW-FCP-NEXT:    retq
19226;
19227; AVX512DQ-BW-LABEL: load_i32_stride8_vf64:
19228; AVX512DQ-BW:       # %bb.0:
19229; AVX512DQ-BW-NEXT:    subq $3144, %rsp # imm = 0xC48
19230; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm11
19231; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19232; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm18
19233; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
19234; AVX512DQ-BW-NEXT:    vmovaps 1536(%rdi), %zmm0
19235; AVX512DQ-BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
19236; AVX512DQ-BW-NEXT:    vmovdqa64 1728(%rdi), %zmm24
19237; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19238; AVX512DQ-BW-NEXT:    vmovaps 1664(%rdi), %zmm0
19239; AVX512DQ-BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19240; AVX512DQ-BW-NEXT:    vmovdqa64 1856(%rdi), %zmm21
19241; AVX512DQ-BW-NEXT:    vmovdqa64 1792(%rdi), %zmm26
19242; AVX512DQ-BW-NEXT:    vmovdqa64 1984(%rdi), %zmm22
19243; AVX512DQ-BW-NEXT:    vmovdqa64 1920(%rdi), %zmm5
19244; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
19245; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
19246; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm30
19247; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
19248; AVX512DQ-BW-NEXT:    vmovdqa64 1344(%rdi), %zmm29
19249; AVX512DQ-BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
19250; AVX512DQ-BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
19251; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
19252; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm25
19253; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
19254; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
19255; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19256; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
19257; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
19258; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19259; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
19260; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
19261; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
19262; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
19263; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19264; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm1
19265; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm16
19266; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19267; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
19268; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
19269; AVX512DQ-BW-NEXT:    movb $-64, %al
19270; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
19271; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19272; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm1
19273; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm15
19274; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
19275; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm12
19276; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm9
19277; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
19278; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19279; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19280; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19281; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm1
19282; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm6
19283; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19284; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19285; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm4
19286; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
19287; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19288; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm1
19289; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
19290; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
19291; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
19292; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm12
19293; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
19294; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19295; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19296; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19297; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm14
19298; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19299; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
19300; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
19301; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm17
19302; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19303; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm4
19304; AVX512DQ-BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
19305; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19306; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19307; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm1
19308; AVX512DQ-BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
19309; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
19310; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
19311; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, %zmm24
19312; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
19313; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19314; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
19315; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19316; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19317; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19318; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
19319; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19320; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm22
19321; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
19322; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
19323; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19324; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
19325; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19326; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
19327; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19328; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
19329; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm7
19330; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
19331; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
19332; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19333; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19334; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19335; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19336; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
19337; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19338; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm1
19339; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19340; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm4
19341; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19342; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
19343; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19344; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19345; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm1
19346; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm31
19347; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
19348; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm12
19349; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19350; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm30
19351; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
19352; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19353; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19354; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19355; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19356; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm1
19357; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19358; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
19359; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19360; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
19361; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19362; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19363; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm1
19364; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19365; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
19366; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19367; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm12
19368; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
19369; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19370; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19371; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19372; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm1
19373; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
19374; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm4
19375; AVX512DQ-BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
19376; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19377; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm1
19378; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19379; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
19380; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
19381; AVX512DQ-BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
19382; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19383; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19384; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19385; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19386; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm1
19387; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19388; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
19389; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19390; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
19391; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19392; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19393; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm1
19394; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19395; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
19396; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
19397; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19398; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19399; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19400; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
19401; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19402; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19403; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm1
19404; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19405; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm4
19406; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
19407; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19408; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
19409; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
19410; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
19411; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19412; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19413; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm1
19414; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19415; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
19416; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
19417; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19418; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
19419; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm16
19420; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
19421; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm29
19422; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
19423; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19424; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19425; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19426; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm1
19427; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19428; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
19429; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19430; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm4
19431; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm7
19432; AVX512DQ-BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
19433; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19434; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19435; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1
19436; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm25
19437; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
19438; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
19439; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm12
19440; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm15
19441; AVX512DQ-BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
19442; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19443; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19444; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19445; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm13
19446; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm1
19447; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19448; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
19449; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
19450; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19451; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm6
19452; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm1
19453; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
19454; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm24
19455; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19456; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
19457; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19458; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19459; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19460; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
19461; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19462; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm1
19463; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19464; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm4
19465; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19466; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
19467; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19468; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19469; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, %zmm21
19470; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
19471; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19472; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
19473; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19474; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19475; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19476; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19477; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm1
19478; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19479; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19480; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
19481; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
19482; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm31
19483; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19484; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19485; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
19486; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19487; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
19488; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19489; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19490; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19491; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm1
19492; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
19493; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm4
19494; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
19495; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm26
19496; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19497; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1
19498; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm9
19499; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
19500; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm12
19501; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
19502; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19503; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19504; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19505; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm1
19506; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19507; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm25
19508; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
19509; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19510; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
19511; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19512; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm1
19513; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm18
19514; AVX512DQ-BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
19515; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
19516; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, %zmm24
19517; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm13
19518; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19519; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19520; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19521; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
19522; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19523; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm1
19524; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19525; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19526; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19527; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm4
19528; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm3
19529; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
19530; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19531; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19532; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm1
19533; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm6
19534; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19535; AVX512DQ-BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
19536; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19537; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19538; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
19539; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19540; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19541; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19542; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm1
19543; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19544; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
19545; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
19546; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19547; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19548; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm1
19549; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
19550; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19551; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
19552; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, %zmm11
19553; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
19554; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19555; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19556; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19557; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
19558; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm1
19559; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
19560; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm31
19561; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19562; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm4
19563; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm19
19564; AVX512DQ-BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
19565; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19566; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19567; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
19568; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm26
19569; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
19570; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
19571; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm29
19572; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19573; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19574; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19575; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19576; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19577; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm1
19578; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
19579; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
19580; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm14
19581; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
19582; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19583; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19584; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1
19585; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19586; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm5
19587; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm13
19588; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19589; AVX512DQ-BW-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
19590; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19591; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19592; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19593; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
19594; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19595; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
19596; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
19597; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
19598; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm1
19599; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
19600; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19601; AVX512DQ-BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
19602; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19603; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
19604; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19605; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19606; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm1
19607; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19608; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
19609; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
19610; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19611; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
19612; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19613; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm1
19614; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm15
19615; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
19616; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm8
19617; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
19618; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
19619; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm2
19620; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19621; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19622; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19623; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm1
19624; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
19625; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, %zmm16
19626; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm4
19627; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm27
19628; AVX512DQ-BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
19629; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, %zmm20
19630; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19631; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19632; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm1
19633; AVX512DQ-BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
19634; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm17
19635; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
19636; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm12
19637; AVX512DQ-BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
19638; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19639; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19640; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19641; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm1
19642; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
19643; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm4
19644; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
19645; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19646; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1
19647; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19648; AVX512DQ-BW-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
19649; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19650; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19651; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19652; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
19653; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19654; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm4
19655; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm30
19656; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
19657; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
19658; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19659; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
19660; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm19
19661; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm11
19662; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
19663; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
19664; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm9
19665; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
19666; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
19667; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19668; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19669; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm8
19670; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
19671; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
19672; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19673; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm31
19674; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
19675; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
19676; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm16
19677; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm29
19678; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
19679; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
19680; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm26
19681; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm2
19682; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm5
19683; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
19684; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
19685; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19686; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm4
19687; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19688; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
19689; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
19690; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm15
19691; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19692; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm23
19693; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19694; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
19695; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
19696; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm17
19697; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19698; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm24
19699; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19700; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
19701; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
19702; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm27
19703; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19704; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm13
19705; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19706; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
19707; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
19708; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19709; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm3
19710; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19711; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
19712; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
19713; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm6
19714; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm20
19715; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
19716; AVX512DQ-BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
19717; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm21
19718; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm25
19719; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
19720; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
19721; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19722; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm14
19723; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
19724; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
19725; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19726; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19727; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
19728; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
19729; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm6
19730; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
19731; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
19732; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
19733; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
19734; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
19735; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
19736; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
19737; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
19738; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
19739; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
19740; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
19741; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
19742; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
19743; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19744; AVX512DQ-BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
19745; AVX512DQ-BW-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
19746; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
19747; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
19748; AVX512DQ-BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
19749; AVX512DQ-BW-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
19750; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
19751; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
19752; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
19753; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
19754; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
19755; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
19756; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
19757; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19758; AVX512DQ-BW-NEXT:    vmovaps %zmm8, 192(%rsi)
19759; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19760; AVX512DQ-BW-NEXT:    vmovaps %zmm8, 128(%rsi)
19761; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19762; AVX512DQ-BW-NEXT:    vmovaps %zmm8, 64(%rsi)
19763; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19764; AVX512DQ-BW-NEXT:    vmovaps %zmm6, (%rsi)
19765; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19766; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 192(%rdx)
19767; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19768; AVX512DQ-BW-NEXT:    vmovaps %zmm6, (%rdx)
19769; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19770; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 64(%rdx)
19771; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19772; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 128(%rdx)
19773; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19774; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 192(%rcx)
19775; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19776; AVX512DQ-BW-NEXT:    vmovaps %zmm6, (%rcx)
19777; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19778; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 64(%rcx)
19779; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19780; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 128(%rcx)
19781; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19782; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 192(%r8)
19783; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19784; AVX512DQ-BW-NEXT:    vmovaps %zmm6, (%r8)
19785; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19786; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 64(%r8)
19787; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19788; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 128(%r8)
19789; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19790; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 192(%r9)
19791; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19792; AVX512DQ-BW-NEXT:    vmovaps %zmm6, (%r9)
19793; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19794; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 64(%r9)
19795; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19796; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 128(%r9)
19797; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
19798; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19799; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 192(%rax)
19800; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19801; AVX512DQ-BW-NEXT:    vmovaps %zmm6, (%rax)
19802; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19803; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 64(%rax)
19804; AVX512DQ-BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19805; AVX512DQ-BW-NEXT:    vmovaps %zmm6, 128(%rax)
19806; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
19807; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
19808; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
19809; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
19810; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
19811; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
19812; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
19813; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
19814; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rax)
19815; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
19816; AVX512DQ-BW-NEXT:    addq $3144, %rsp # imm = 0xC48
19817; AVX512DQ-BW-NEXT:    vzeroupper
19818; AVX512DQ-BW-NEXT:    retq
19819;
19820; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf64:
19821; AVX512DQ-BW-FCP:       # %bb.0:
19822; AVX512DQ-BW-FCP-NEXT:    subq $3144, %rsp # imm = 0xC48
19823; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
19824; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19825; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
19826; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
19827; AVX512DQ-BW-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
19828; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
19829; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
19830; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19831; AVX512DQ-BW-FCP-NEXT:    vmovaps 1664(%rdi), %zmm0
19832; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19833; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm21
19834; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
19835; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm22
19836; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm5
19837; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
19838; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
19839; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm30
19840; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
19841; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm29
19842; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
19843; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
19844; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
19845; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm25
19846; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
19847; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
19848; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19849; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
19850; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
19851; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19852; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
19853; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
19854; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
19855; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
19856; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19857; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
19858; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
19859; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19860; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
19861; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
19862; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
19863; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
19864; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19865; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
19866; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
19867; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
19868; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
19869; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
19870; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
19871; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19872; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19873; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19874; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
19875; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
19876; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19877; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19878; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
19879; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
19880; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19881; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
19882; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
19883; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
19884; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
19885; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
19886; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
19887; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19888; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19889; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19890; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
19891; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19892; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
19893; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
19894; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
19895; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19896; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
19897; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
19898; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19899; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19900; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
19901; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
19902; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
19903; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
19904; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
19905; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
19906; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19907; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
19908; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19909; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19910; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19911; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
19912; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19913; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
19914; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
19915; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
19916; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19917; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
19918; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19919; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
19920; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19921; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
19922; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
19923; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm19
19924; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
19925; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19926; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19927; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19928; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19929; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
19930; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19931; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
19932; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19933; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
19934; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19935; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
19936; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19937; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19938; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
19939; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm31
19940; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
19941; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm12
19942; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19943; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm30
19944; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
19945; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19946; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19947; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19948; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19949; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
19950; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
19951; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
19952; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19953; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
19954; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19955; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19956; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
19957; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19958; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
19959; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19960; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
19961; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
19962; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19963; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19964; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19965; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
19966; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
19967; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
19968; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
19969; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19970; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
19971; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19972; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
19973; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
19974; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
19975; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19976; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19977; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19978; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19979; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
19980; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
19981; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
19982; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19983; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
19984; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
19985; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19986; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
19987; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19988; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
19989; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
19990; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19991; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19992; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19993; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
19994; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19995; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19996; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
19997; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
19998; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
19999; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
20000; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20001; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
20002; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
20003; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
20004; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20005; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20006; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
20007; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
20008; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
20009; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
20010; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20011; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
20012; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm16
20013; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
20014; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29
20015; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
20016; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20017; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20018; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20019; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
20020; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20021; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
20022; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20023; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
20024; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm7
20025; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
20026; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20027; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20028; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
20029; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm25
20030; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
20031; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
20032; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
20033; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm15
20034; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
20035; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20036; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20037; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20038; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
20039; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
20040; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
20041; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
20042; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
20043; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20044; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm6
20045; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
20046; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
20047; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm24
20048; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20049; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
20050; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20051; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20052; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20053; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
20054; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20055; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
20056; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
20057; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
20058; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20059; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
20060; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20061; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20062; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm21
20063; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
20064; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20065; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
20066; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20067; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20068; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20069; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20070; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
20071; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20072; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
20073; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
20074; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
20075; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm31
20076; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20077; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20078; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
20079; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20080; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
20081; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20082; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20083; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20084; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
20085; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
20086; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
20087; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
20088; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm26
20089; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20090; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
20091; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
20092; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
20093; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
20094; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
20095; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20096; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20097; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20098; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
20099; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
20100; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
20101; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
20102; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20103; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
20104; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20105; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
20106; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm18
20107; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
20108; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
20109; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm24
20110; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
20111; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20112; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20113; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20114; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
20115; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20116; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
20117; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20118; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
20119; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20120; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
20121; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
20122; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
20123; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20124; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20125; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
20126; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
20127; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20128; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
20129; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20130; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20131; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
20132; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20133; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20134; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20135; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm1
20136; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
20137; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
20138; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
20139; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20140; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20141; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
20142; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
20143; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20144; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
20145; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm11
20146; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
20147; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20148; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20149; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20150; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
20151; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
20152; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
20153; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
20154; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20155; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
20156; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm19
20157; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
20158; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20159; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20160; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
20161; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
20162; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
20163; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
20164; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm29
20165; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20166; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20167; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20168; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20169; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20170; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
20171; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
20172; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
20173; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
20174; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
20175; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20176; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20177; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
20178; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
20179; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm5
20180; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm13
20181; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20182; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
20183; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20184; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20185; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20186; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
20187; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20188; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
20189; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
20190; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
20191; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
20192; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
20193; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20194; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
20195; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20196; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
20197; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20198; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
20199; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm1
20200; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20201; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
20202; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
20203; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20204; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
20205; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20206; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
20207; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm15
20208; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
20209; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm8
20210; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
20211; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
20212; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
20213; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20214; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20215; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20216; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
20217; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
20218; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
20219; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
20220; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm27
20221; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
20222; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm20
20223; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20224; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20225; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
20226; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
20227; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
20228; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
20229; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
20230; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
20231; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20232; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20233; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20234; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
20235; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
20236; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm4
20237; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
20238; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
20239; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
20240; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
20241; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
20242; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20243; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20244; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20245; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
20246; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20247; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm4
20248; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm30
20249; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
20250; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
20251; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20252; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
20253; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
20254; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
20255; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
20256; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
20257; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
20258; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
20259; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
20260; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20261; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20262; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8
20263; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
20264; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
20265; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20266; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm31
20267; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
20268; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
20269; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm16
20270; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm29
20271; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
20272; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
20273; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26
20274; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
20275; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
20276; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
20277; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
20278; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20279; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
20280; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20281; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
20282; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
20283; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm15
20284; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20285; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm23
20286; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20287; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
20288; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
20289; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17
20290; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20291; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm24
20292; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20293; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
20294; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
20295; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm27
20296; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20297; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm13
20298; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20299; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
20300; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
20301; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20302; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm3
20303; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20304; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
20305; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
20306; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
20307; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20
20308; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
20309; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
20310; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
20311; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25
20312; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
20313; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
20314; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20315; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm14
20316; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
20317; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
20318; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20319; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20320; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
20321; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
20322; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
20323; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
20324; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
20325; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
20326; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
20327; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
20328; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
20329; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
20330; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
20331; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
20332; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
20333; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
20334; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
20335; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
20336; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20337; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
20338; AVX512DQ-BW-FCP-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
20339; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
20340; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
20341; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
20342; AVX512DQ-BW-FCP-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
20343; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
20344; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
20345; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
20346; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
20347; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
20348; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
20349; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
20350; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20351; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm8, 192(%rsi)
20352; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20353; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm8, 128(%rsi)
20354; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20355; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm8, 64(%rsi)
20356; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20357; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, (%rsi)
20358; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20359; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 192(%rdx)
20360; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20361; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, (%rdx)
20362; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20363; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 64(%rdx)
20364; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20365; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 128(%rdx)
20366; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20367; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 192(%rcx)
20368; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20369; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, (%rcx)
20370; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20371; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 64(%rcx)
20372; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20373; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 128(%rcx)
20374; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20375; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 192(%r8)
20376; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20377; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, (%r8)
20378; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20379; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 64(%r8)
20380; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20381; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 128(%r8)
20382; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20383; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 192(%r9)
20384; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20385; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, (%r9)
20386; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20387; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 64(%r9)
20388; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20389; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 128(%r9)
20390; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
20391; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20392; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 192(%rax)
20393; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20394; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, (%rax)
20395; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20396; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 64(%rax)
20397; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20398; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm6, 128(%rax)
20399; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
20400; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
20401; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
20402; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
20403; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
20404; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
20405; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
20406; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
20407; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
20408; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
20409; AVX512DQ-BW-FCP-NEXT:    addq $3144, %rsp # imm = 0xC48
20410; AVX512DQ-BW-FCP-NEXT:    vzeroupper
20411; AVX512DQ-BW-FCP-NEXT:    retq
20412  %wide.vec = load <512 x i32>, ptr %in.vec, align 64
20413  %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
20414  %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
20415  %strided.vec2 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250, i32 258, i32 266, i32 274, i32 282, i32 290, i32 298, i32 306, i32 314, i32 322, i32 330, i32 338, i32 346, i32 354, i32 362, i32 370, i32 378, i32 386, i32 394, i32 402, i32 410, i32 418, i32 426, i32 434, i32 442, i32 450, i32 458, i32 466, i32 474, i32 482, i32 490, i32 498, i32 506>
20416  %strided.vec3 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251, i32 259, i32 267, i32 275, i32 283, i32 291, i32 299, i32 307, i32 315, i32 323, i32 331, i32 339, i32 347, i32 355, i32 363, i32 371, i32 379, i32 387, i32 395, i32 403, i32 411, i32 419, i32 427, i32 435, i32 443, i32 451, i32 459, i32 467, i32 475, i32 483, i32 491, i32 499, i32 507>
20417  %strided.vec4 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252, i32 260, i32 268, i32 276, i32 284, i32 292, i32 300, i32 308, i32 316, i32 324, i32 332, i32 340, i32 348, i32 356, i32 364, i32 372, i32 380, i32 388, i32 396, i32 404, i32 412, i32 420, i32 428, i32 436, i32 444, i32 452, i32 460, i32 468, i32 476, i32 484, i32 492, i32 500, i32 508>
20418  %strided.vec5 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253, i32 261, i32 269, i32 277, i32 285, i32 293, i32 301, i32 309, i32 317, i32 325, i32 333, i32 341, i32 349, i32 357, i32 365, i32 373, i32 381, i32 389, i32 397, i32 405, i32 413, i32 421, i32 429, i32 437, i32 445, i32 453, i32 461, i32 469, i32 477, i32 485, i32 493, i32 501, i32 509>
20419  %strided.vec6 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254, i32 262, i32 270, i32 278, i32 286, i32 294, i32 302, i32 310, i32 318, i32 326, i32 334, i32 342, i32 350, i32 358, i32 366, i32 374, i32 382, i32 390, i32 398, i32 406, i32 414, i32 422, i32 430, i32 438, i32 446, i32 454, i32 462, i32 470, i32 478, i32 486, i32 494, i32 502, i32 510>
20420  %strided.vec7 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255, i32 263, i32 271, i32 279, i32 287, i32 295, i32 303, i32 311, i32 319, i32 327, i32 335, i32 343, i32 351, i32 359, i32 367, i32 375, i32 383, i32 391, i32 399, i32 407, i32 415, i32 423, i32 431, i32 439, i32 447, i32 455, i32 463, i32 471, i32 479, i32 487, i32 495, i32 503, i32 511>
20421  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
20422  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
20423  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
20424  store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
20425  store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
20426  store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
20427  store <64 x i32> %strided.vec6, ptr %out.vec6, align 64
20428  store <64 x i32> %strided.vec7, ptr %out.vec7, align 64
20429  ret void
20430}
20431