; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
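;
; As a rough illustration only (not verified by this test), the kind of scalar
; source that the LoopVectorizer turns into the wide-load-plus-shufflevector IR
; below is a hypothetical C loop reading every field of a 7-field record:
;
;   // hypothetical example; array and loop-bound names are illustrative
;   for (int i = 0; i < n; ++i) {
;     out0[i] = in[7*i + 0];
;     out1[i] = in[7*i + 1];
;     // ... likewise for out2..out5 ...
;     out6[i] = in[7*i + 6];
;   }
;
; The vectorizer emits one wide load covering the whole interleave group and
; one shufflevector per strided result; each function in this file checks the
; codegen for that IR shape at a different vector width.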
define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
19; SSE-LABEL: load_i32_stride7_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
22; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
23; SSE-NEXT:    movdqa (%rdi), %xmm0
24; SSE-NEXT:    movdqa 16(%rdi), %xmm1
25; SSE-NEXT:    movdqa 32(%rdi), %xmm2
26; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
27; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
28; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
29; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
30; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
31; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
32; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
33; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
34; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
35; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
36; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
37; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
38; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
39; SSE-NEXT:    movdqa 48(%rdi), %xmm2
40; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
41; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
42; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
43; SSE-NEXT:    movq %xmm0, (%rsi)
44; SSE-NEXT:    movq %xmm4, (%rdx)
45; SSE-NEXT:    movq %xmm5, (%rcx)
46; SSE-NEXT:    movq %xmm6, (%r8)
47; SSE-NEXT:    movq %xmm1, (%r9)
48; SSE-NEXT:    movq %xmm3, (%r10)
49; SSE-NEXT:    movq %xmm7, (%rax)
50; SSE-NEXT:    retq
51;
52; AVX-LABEL: load_i32_stride7_vf2:
53; AVX:       # %bb.0:
54; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
55; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
56; AVX-NEXT:    vmovaps (%rdi), %ymm0
57; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
58; AVX-NEXT:    vmovaps (%rdi), %xmm2
59; AVX-NEXT:    vmovaps 16(%rdi), %xmm3
60; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
61; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
62; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
63; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
64; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
65; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
66; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
67; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
68; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
69; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
70; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
71; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
72; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
73; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
74; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
75; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
76; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
77; AVX-NEXT:    vmovlps %xmm5, (%rsi)
78; AVX-NEXT:    vmovlps %xmm6, (%rdx)
79; AVX-NEXT:    vmovlps %xmm7, (%rcx)
80; AVX-NEXT:    vmovlps %xmm2, (%r8)
81; AVX-NEXT:    vmovlps %xmm3, (%r9)
82; AVX-NEXT:    vmovlps %xmm4, (%r10)
83; AVX-NEXT:    vmovlps %xmm0, (%rax)
84; AVX-NEXT:    vzeroupper
85; AVX-NEXT:    retq
86;
87; AVX2-LABEL: load_i32_stride7_vf2:
88; AVX2:       # %bb.0:
89; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
90; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
91; AVX2-NEXT:    vmovaps (%rdi), %ymm0
92; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
93; AVX2-NEXT:    vbroadcastss 28(%rdi), %xmm2
94; AVX2-NEXT:    vmovaps (%rdi), %xmm3
95; AVX2-NEXT:    vmovaps 32(%rdi), %xmm4
96; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
97; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
98; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
99; AVX2-NEXT:    vbroadcastss 8(%rdi), %xmm6
100; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
101; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
102; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
103; AVX2-NEXT:    vmovsd {{.*#+}} xmm4 = [4,3,0,0]
104; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
105; AVX2-NEXT:    vpermps %ymm7, %ymm4, %ymm4
106; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
107; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
108; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
109; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
110; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
111; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
112; AVX2-NEXT:    vmovlps %xmm2, (%rsi)
113; AVX2-NEXT:    vmovlps %xmm5, (%rdx)
114; AVX2-NEXT:    vmovlps %xmm6, (%rcx)
115; AVX2-NEXT:    vmovlps %xmm3, (%r8)
116; AVX2-NEXT:    vmovlps %xmm4, (%r9)
117; AVX2-NEXT:    vmovlps %xmm7, (%r10)
118; AVX2-NEXT:    vmovlps %xmm0, (%rax)
119; AVX2-NEXT:    vzeroupper
120; AVX2-NEXT:    retq
121;
122; AVX2-FP-LABEL: load_i32_stride7_vf2:
123; AVX2-FP:       # %bb.0:
124; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
125; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
126; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
127; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
128; AVX2-FP-NEXT:    vbroadcastss 28(%rdi), %xmm2
129; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm3
130; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm4
131; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
132; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
133; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
134; AVX2-FP-NEXT:    vbroadcastss 8(%rdi), %xmm6
135; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
136; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
137; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
138; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm4 = [4,3,0,0]
139; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
140; AVX2-FP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
141; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
142; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
143; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
144; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
145; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
146; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
147; AVX2-FP-NEXT:    vmovlps %xmm2, (%rsi)
148; AVX2-FP-NEXT:    vmovlps %xmm5, (%rdx)
149; AVX2-FP-NEXT:    vmovlps %xmm6, (%rcx)
150; AVX2-FP-NEXT:    vmovlps %xmm3, (%r8)
151; AVX2-FP-NEXT:    vmovlps %xmm4, (%r9)
152; AVX2-FP-NEXT:    vmovlps %xmm7, (%r10)
153; AVX2-FP-NEXT:    vmovlps %xmm0, (%rax)
154; AVX2-FP-NEXT:    vzeroupper
155; AVX2-FP-NEXT:    retq
156;
157; AVX2-FCP-LABEL: load_i32_stride7_vf2:
158; AVX2-FCP:       # %bb.0:
159; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
160; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
161; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
162; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
163; AVX2-FCP-NEXT:    vbroadcastss 28(%rdi), %xmm2
164; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm3
165; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm4
166; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
167; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
168; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
169; AVX2-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm6
170; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
171; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
172; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
173; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm4 = [4,3,0,0]
174; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
175; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
176; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
177; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm7
178; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
179; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
180; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
181; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
182; AVX2-FCP-NEXT:    vmovlps %xmm2, (%rsi)
183; AVX2-FCP-NEXT:    vmovlps %xmm5, (%rdx)
184; AVX2-FCP-NEXT:    vmovlps %xmm6, (%rcx)
185; AVX2-FCP-NEXT:    vmovlps %xmm3, (%r8)
186; AVX2-FCP-NEXT:    vmovlps %xmm4, (%r9)
187; AVX2-FCP-NEXT:    vmovlps %xmm7, (%r10)
188; AVX2-FCP-NEXT:    vmovlps %xmm0, (%rax)
189; AVX2-FCP-NEXT:    vzeroupper
190; AVX2-FCP-NEXT:    retq
191;
192; AVX512-LABEL: load_i32_stride7_vf2:
193; AVX512:       # %bb.0:
194; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
195; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
196; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
197; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
198; AVX512-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
199; AVX512-NEXT:    vmovd %xmm1, %r11d
200; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
201; AVX512-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
202; AVX512-NEXT:    vpbroadcastd 8(%rdi), %xmm4
203; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
204; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
205; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
206; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
207; AVX512-NEXT:    vpermps (%rdi), %zmm1, %zmm1
208; AVX512-NEXT:    vmovaps (%rdi), %ymm5
209; AVX512-NEXT:    vmovaps 32(%rdi), %ymm6
210; AVX512-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
211; AVX512-NEXT:    vextractf128 $1, %ymm7, %xmm7
212; AVX512-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
213; AVX512-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
214; AVX512-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
215; AVX512-NEXT:    vextractf128 $1, %ymm5, %xmm5
216; AVX512-NEXT:    vmovq %xmm2, (%rsi)
217; AVX512-NEXT:    vmovq %xmm3, (%rdx)
218; AVX512-NEXT:    vmovq %xmm4, (%rcx)
219; AVX512-NEXT:    vmovq %xmm0, (%r8)
220; AVX512-NEXT:    vmovlps %xmm1, (%r9)
221; AVX512-NEXT:    vmovlps %xmm7, (%r10)
222; AVX512-NEXT:    vmovlps %xmm5, (%rax)
223; AVX512-NEXT:    vzeroupper
224; AVX512-NEXT:    retq
225;
226; AVX512-FCP-LABEL: load_i32_stride7_vf2:
227; AVX512-FCP:       # %bb.0:
228; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
229; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
230; AVX512-FCP-NEXT:    vmovaps (%rdi), %zmm0
231; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
232; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
233; AVX512-FCP-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
234; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
235; AVX512-FCP-NEXT:    vpermi2d %xmm2, %xmm1, %xmm4
236; AVX512-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm5
237; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
238; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
239; AVX512-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
240; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
241; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
242; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
243; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
244; AVX512-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
245; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
246; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
247; AVX512-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
248; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
249; AVX512-FCP-NEXT:    vmovq %xmm4, (%rdx)
250; AVX512-FCP-NEXT:    vmovq %xmm5, (%rcx)
251; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
252; AVX512-FCP-NEXT:    vmovlps %xmm1, (%r9)
253; AVX512-FCP-NEXT:    vmovq %xmm7, (%r10)
254; AVX512-FCP-NEXT:    vmovlps %xmm0, (%rax)
255; AVX512-FCP-NEXT:    vzeroupper
256; AVX512-FCP-NEXT:    retq
257;
258; AVX512DQ-LABEL: load_i32_stride7_vf2:
259; AVX512DQ:       # %bb.0:
260; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
261; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
262; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
263; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
264; AVX512DQ-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
265; AVX512DQ-NEXT:    vmovd %xmm1, %r11d
266; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
267; AVX512DQ-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
268; AVX512DQ-NEXT:    vpbroadcastd 8(%rdi), %xmm4
269; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
270; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
271; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
272; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
273; AVX512DQ-NEXT:    vpermps (%rdi), %zmm1, %zmm1
274; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm5
275; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm6
276; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
277; AVX512DQ-NEXT:    vextractf128 $1, %ymm7, %xmm7
278; AVX512DQ-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
279; AVX512DQ-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
280; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
281; AVX512DQ-NEXT:    vextractf128 $1, %ymm5, %xmm5
282; AVX512DQ-NEXT:    vmovq %xmm2, (%rsi)
283; AVX512DQ-NEXT:    vmovq %xmm3, (%rdx)
284; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
285; AVX512DQ-NEXT:    vmovq %xmm0, (%r8)
286; AVX512DQ-NEXT:    vmovlps %xmm1, (%r9)
287; AVX512DQ-NEXT:    vmovlps %xmm7, (%r10)
288; AVX512DQ-NEXT:    vmovlps %xmm5, (%rax)
289; AVX512DQ-NEXT:    vzeroupper
290; AVX512DQ-NEXT:    retq
291;
292; AVX512DQ-FCP-LABEL: load_i32_stride7_vf2:
293; AVX512DQ-FCP:       # %bb.0:
294; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
295; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
296; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %zmm0
297; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
298; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
299; AVX512DQ-FCP-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
300; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
301; AVX512DQ-FCP-NEXT:    vpermi2d %xmm2, %xmm1, %xmm4
302; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm5
303; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
304; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
305; AVX512DQ-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
306; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
307; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
308; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
309; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
310; AVX512DQ-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
311; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
312; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
313; AVX512DQ-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
314; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
315; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rdx)
316; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rcx)
317; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
318; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, (%r9)
319; AVX512DQ-FCP-NEXT:    vmovq %xmm7, (%r10)
320; AVX512DQ-FCP-NEXT:    vmovlps %xmm0, (%rax)
321; AVX512DQ-FCP-NEXT:    vzeroupper
322; AVX512DQ-FCP-NEXT:    retq
323;
324; AVX512BW-LABEL: load_i32_stride7_vf2:
325; AVX512BW:       # %bb.0:
326; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
327; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
328; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
329; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm1
330; AVX512BW-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
331; AVX512BW-NEXT:    vmovd %xmm1, %r11d
332; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
333; AVX512BW-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
334; AVX512BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
335; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
336; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
337; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
338; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
339; AVX512BW-NEXT:    vpermps (%rdi), %zmm1, %zmm1
340; AVX512BW-NEXT:    vmovaps (%rdi), %ymm5
341; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm6
342; AVX512BW-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
343; AVX512BW-NEXT:    vextractf128 $1, %ymm7, %xmm7
344; AVX512BW-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
345; AVX512BW-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
346; AVX512BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
347; AVX512BW-NEXT:    vextractf128 $1, %ymm5, %xmm5
348; AVX512BW-NEXT:    vmovq %xmm2, (%rsi)
349; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
350; AVX512BW-NEXT:    vmovq %xmm4, (%rcx)
351; AVX512BW-NEXT:    vmovq %xmm0, (%r8)
352; AVX512BW-NEXT:    vmovlps %xmm1, (%r9)
353; AVX512BW-NEXT:    vmovlps %xmm7, (%r10)
354; AVX512BW-NEXT:    vmovlps %xmm5, (%rax)
355; AVX512BW-NEXT:    vzeroupper
356; AVX512BW-NEXT:    retq
357;
358; AVX512BW-FCP-LABEL: load_i32_stride7_vf2:
359; AVX512BW-FCP:       # %bb.0:
360; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
361; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
362; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
363; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
364; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
365; AVX512BW-FCP-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
366; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
367; AVX512BW-FCP-NEXT:    vpermi2d %xmm2, %xmm1, %xmm4
368; AVX512BW-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm5
369; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
370; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
371; AVX512BW-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
372; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
373; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
374; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
375; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
376; AVX512BW-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
377; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
378; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
379; AVX512BW-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
380; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
381; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%rdx)
382; AVX512BW-FCP-NEXT:    vmovq %xmm5, (%rcx)
383; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%r8)
384; AVX512BW-FCP-NEXT:    vmovlps %xmm1, (%r9)
385; AVX512BW-FCP-NEXT:    vmovq %xmm7, (%r10)
386; AVX512BW-FCP-NEXT:    vmovlps %xmm0, (%rax)
387; AVX512BW-FCP-NEXT:    vzeroupper
388; AVX512BW-FCP-NEXT:    retq
389;
390; AVX512DQ-BW-LABEL: load_i32_stride7_vf2:
391; AVX512DQ-BW:       # %bb.0:
392; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
393; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
394; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
395; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm1
396; AVX512DQ-BW-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
397; AVX512DQ-BW-NEXT:    vmovd %xmm1, %r11d
398; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
399; AVX512DQ-BW-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
400; AVX512DQ-BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
401; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
402; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
403; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
404; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
405; AVX512DQ-BW-NEXT:    vpermps (%rdi), %zmm1, %zmm1
406; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %ymm5
407; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm6
408; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
409; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm7, %xmm7
410; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
411; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
412; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
413; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm5, %xmm5
414; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rsi)
415; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rdx)
416; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rcx)
417; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r8)
418; AVX512DQ-BW-NEXT:    vmovlps %xmm1, (%r9)
419; AVX512DQ-BW-NEXT:    vmovlps %xmm7, (%r10)
420; AVX512DQ-BW-NEXT:    vmovlps %xmm5, (%rax)
421; AVX512DQ-BW-NEXT:    vzeroupper
422; AVX512DQ-BW-NEXT:    retq
423;
424; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf2:
425; AVX512DQ-BW-FCP:       # %bb.0:
426; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
427; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
428; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
429; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
430; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
431; AVX512DQ-BW-FCP-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
432; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
433; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm2, %xmm1, %xmm4
434; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm5
435; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
436; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
437; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
438; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
439; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
440; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
441; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
442; AVX512DQ-BW-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
443; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
444; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
445; AVX512DQ-BW-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
446; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
447; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%rdx)
448; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm5, (%rcx)
449; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%r8)
450; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, (%r9)
451; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm7, (%r10)
452; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm0, (%rax)
453; AVX512DQ-BW-FCP-NEXT:    vzeroupper
454; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <14 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 0, i32 7>
  %strided.vec1 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 1, i32 8>
  %strided.vec2 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 2, i32 9>
  %strided.vec3 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 3, i32 10>
  %strided.vec4 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 4, i32 11>
  %strided.vec5 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 5, i32 12>
  %strided.vec6 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 6, i32 13>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <2 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
474; SSE-LABEL: load_i32_stride7_vf4:
475; SSE:       # %bb.0:
476; SSE-NEXT:    movdqa 96(%rdi), %xmm1
477; SSE-NEXT:    movdqa 64(%rdi), %xmm0
478; SSE-NEXT:    movdqa 80(%rdi), %xmm2
479; SSE-NEXT:    movdqa (%rdi), %xmm11
480; SSE-NEXT:    movdqa 16(%rdi), %xmm3
481; SSE-NEXT:    movdqa 32(%rdi), %xmm4
482; SSE-NEXT:    movdqa 48(%rdi), %xmm6
483; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[3,3,3,3]
484; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1]
485; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3]
486; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[2,2,3,3]
487; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
488; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
489; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
490; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1]
491; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
492; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2]
493; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1]
494; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
495; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm6[1,1,1,1]
496; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3]
497; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1]
498; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
499; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
500; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
501; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1]
502; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
503; SSE-NEXT:    movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1]
504; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1]
505; SSE-NEXT:    movdqa %xmm0, %xmm8
506; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
507; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
508; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
509; SSE-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
510; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1]
511; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
512; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
513; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
514; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3]
515; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
516; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
517; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
518; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
519; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
520; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
521; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
522; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
523; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
524; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
525; SSE-NEXT:    movapd %xmm5, (%rsi)
526; SSE-NEXT:    movapd %xmm6, (%rdx)
527; SSE-NEXT:    movapd %xmm11, (%rcx)
528; SSE-NEXT:    movapd %xmm8, (%r8)
529; SSE-NEXT:    movapd %xmm4, (%r9)
530; SSE-NEXT:    movapd %xmm0, (%rdi)
531; SSE-NEXT:    movapd %xmm2, (%rax)
532; SSE-NEXT:    retq
533;
534; AVX-LABEL: load_i32_stride7_vf4:
535; AVX:       # %bb.0:
536; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
537; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
538; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
539; AVX-NEXT:    vmovaps (%rdi), %ymm1
540; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
541; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
542; AVX-NEXT:    vmovaps (%rdi), %xmm3
543; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
544; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
545; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
546; AVX-NEXT:    vmovaps 80(%rdi), %xmm6
547; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
548; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
549; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3]
550; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3]
551; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[2]
552; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm5[0,1,0,1]
553; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3]
554; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3]
555; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
556; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
557; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
558; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3]
559; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
560; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3]
561; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
562; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1]
563; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
564; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
565; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
566; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3]
567; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3]
568; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
569; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
570; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm10[2,0],xmm5[3,2]
571; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,0,1]
572; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3]
573; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
574; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
575; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3]
576; AVX-NEXT:    vmovaps %xmm2, (%rsi)
577; AVX-NEXT:    vmovaps %xmm7, (%rdx)
578; AVX-NEXT:    vmovaps %xmm8, (%rcx)
579; AVX-NEXT:    vmovaps %xmm3, (%r8)
580; AVX-NEXT:    vmovaps %xmm4, (%r9)
581; AVX-NEXT:    vmovaps %xmm5, (%r10)
582; AVX-NEXT:    vmovaps %xmm0, (%rax)
583; AVX-NEXT:    vzeroupper
584; AVX-NEXT:    retq
585;
586; AVX2-LABEL: load_i32_stride7_vf4:
587; AVX2:       # %bb.0:
588; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
589; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
590; AVX2-NEXT:    vmovaps (%rdi), %ymm0
591; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
592; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [0,7,6,u]
593; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
594; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm2
595; AVX2-NEXT:    vbroadcastss 84(%rdi), %xmm3
596; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
597; AVX2-NEXT:    vmovaps 80(%rdi), %xmm4
598; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
599; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
600; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
601; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
602; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
603; AVX2-NEXT:    vbroadcastss 8(%rdi), %xmm6
604; AVX2-NEXT:    vmovaps 32(%rdi), %xmm7
605; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
606; AVX2-NEXT:    vmovaps 64(%rdi), %xmm8
607; AVX2-NEXT:    vbroadcastss %xmm8, %xmm9
608; AVX2-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
609; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
610; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
611; AVX2-NEXT:    vmovaps 96(%rdi), %xmm7
612; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
613; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
614; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
615; AVX2-NEXT:    vbroadcastss 100(%rdi), %xmm9
616; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
617; AVX2-NEXT:    vmovsd {{.*#+}} xmm10 = [4,3,0,0]
618; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
619; AVX2-NEXT:    vpermps %ymm11, %ymm10, %ymm10
620; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
621; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
622; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
623; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
624; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
625; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
626; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm8
627; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
628; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
629; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
630; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
631; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
632; AVX2-NEXT:    vmovaps %xmm2, (%rsi)
633; AVX2-NEXT:    vmovaps %xmm3, (%rdx)
634; AVX2-NEXT:    vmovaps %xmm4, (%rcx)
635; AVX2-NEXT:    vmovaps %xmm6, (%r8)
636; AVX2-NEXT:    vmovaps %xmm9, (%r9)
637; AVX2-NEXT:    vmovaps %xmm5, (%r10)
638; AVX2-NEXT:    vmovaps %xmm0, (%rax)
639; AVX2-NEXT:    vzeroupper
640; AVX2-NEXT:    retq
641;
642; AVX2-FP-LABEL: load_i32_stride7_vf4:
643; AVX2-FP:       # %bb.0:
644; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
645; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
646; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
647; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
648; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm2 = [0,7,6,u]
649; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
650; AVX2-FP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
651; AVX2-FP-NEXT:    vbroadcastss 84(%rdi), %xmm3
652; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
653; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm4
654; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
655; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
656; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
657; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
658; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
659; AVX2-FP-NEXT:    vbroadcastss 8(%rdi), %xmm6
660; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm7
661; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
662; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm8
663; AVX2-FP-NEXT:    vbroadcastss %xmm8, %xmm9
664; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
665; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
666; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
667; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm7
668; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
669; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
670; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
671; AVX2-FP-NEXT:    vbroadcastss 100(%rdi), %xmm9
672; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
673; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm10 = [4,3,0,0]
674; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
675; AVX2-FP-NEXT:    vpermps %ymm11, %ymm10, %ymm10
676; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
677; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
678; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
679; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
680; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
681; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
682; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm8
683; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
684; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
685; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
686; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
687; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
688; AVX2-FP-NEXT:    vmovaps %xmm2, (%rsi)
689; AVX2-FP-NEXT:    vmovaps %xmm3, (%rdx)
690; AVX2-FP-NEXT:    vmovaps %xmm4, (%rcx)
691; AVX2-FP-NEXT:    vmovaps %xmm6, (%r8)
692; AVX2-FP-NEXT:    vmovaps %xmm9, (%r9)
693; AVX2-FP-NEXT:    vmovaps %xmm5, (%r10)
694; AVX2-FP-NEXT:    vmovaps %xmm0, (%rax)
695; AVX2-FP-NEXT:    vzeroupper
696; AVX2-FP-NEXT:    retq
697;
698; AVX2-FCP-LABEL: load_i32_stride7_vf4:
699; AVX2-FCP:       # %bb.0:
700; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
701; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
702; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
703; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
704; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm2 = [0,7,6,u]
705; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
706; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
707; AVX2-FCP-NEXT:    vbroadcastss 84(%rdi), %xmm3
708; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
709; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0]
710; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
711; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
712; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm3, %ymm3
713; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm5
714; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2]
715; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3]
716; AVX2-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm6
717; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm7
718; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
719; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm8
720; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm9
721; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
722; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
723; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
724; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm7
725; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
726; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
727; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
728; AVX2-FCP-NEXT:    vbroadcastss 100(%rdi), %xmm9
729; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
730; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm10 = [4,3,0,0]
731; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
732; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm10, %ymm10
733; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
734; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
735; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
736; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7]
737; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
738; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
739; AVX2-FCP-NEXT:    vbroadcastss 80(%rdi), %ymm8
740; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
741; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
742; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
743; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
744; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
745; AVX2-FCP-NEXT:    vmovaps %xmm2, (%rsi)
746; AVX2-FCP-NEXT:    vmovaps %xmm3, (%rdx)
747; AVX2-FCP-NEXT:    vmovaps %xmm5, (%rcx)
748; AVX2-FCP-NEXT:    vmovaps %xmm6, (%r8)
749; AVX2-FCP-NEXT:    vmovaps %xmm9, (%r9)
750; AVX2-FCP-NEXT:    vmovaps %xmm4, (%r10)
751; AVX2-FCP-NEXT:    vmovaps %xmm0, (%rax)
752; AVX2-FCP-NEXT:    vzeroupper
753; AVX2-FCP-NEXT:    retq
754;
755; AVX512-LABEL: load_i32_stride7_vf4:
756; AVX512:       # %bb.0:
757; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
758; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
759; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
760; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
761; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
762; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
763; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
764; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
765; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
766; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
767; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
768; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
769; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
770; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
771; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
772; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
773; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
774; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
775; AVX512-NEXT:    vmovdqa %xmm2, (%rsi)
776; AVX512-NEXT:    vmovdqa %xmm3, (%rdx)
777; AVX512-NEXT:    vmovdqa %xmm4, (%rcx)
778; AVX512-NEXT:    vmovdqa %xmm5, (%r8)
779; AVX512-NEXT:    vmovdqa %xmm6, (%r9)
780; AVX512-NEXT:    vmovdqa %xmm7, (%r10)
781; AVX512-NEXT:    vmovdqa %xmm8, (%rax)
782; AVX512-NEXT:    vzeroupper
783; AVX512-NEXT:    retq
784;
785; AVX512-FCP-LABEL: load_i32_stride7_vf4:
786; AVX512-FCP:       # %bb.0:
787; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
788; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
789; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
790; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
791; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
792; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
793; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
794; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
795; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
796; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
797; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
798; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
799; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
800; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
801; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
802; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
803; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
804; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
805; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
806; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
807; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
808; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r8)
809; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r9)
810; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%r10)
811; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rax)
812; AVX512-FCP-NEXT:    vzeroupper
813; AVX512-FCP-NEXT:    retq
814;
815; AVX512DQ-LABEL: load_i32_stride7_vf4:
816; AVX512DQ:       # %bb.0:
817; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
818; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
819; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
820; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
821; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
822; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
823; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
824; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
825; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
826; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
827; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
828; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
829; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
830; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
831; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
832; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
833; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
834; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
835; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsi)
836; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rdx)
837; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rcx)
838; AVX512DQ-NEXT:    vmovdqa %xmm5, (%r8)
839; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r9)
840; AVX512DQ-NEXT:    vmovdqa %xmm7, (%r10)
841; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rax)
842; AVX512DQ-NEXT:    vzeroupper
843; AVX512DQ-NEXT:    retq
844;
845; AVX512DQ-FCP-LABEL: load_i32_stride7_vf4:
846; AVX512DQ-FCP:       # %bb.0:
847; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
848; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
849; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
850; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
851; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
852; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
853; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
854; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
855; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
856; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
857; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
858; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
859; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
860; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
861; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
862; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
863; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
864; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
865; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
866; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
867; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
868; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r8)
869; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r9)
870; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%r10)
871; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%rax)
872; AVX512DQ-FCP-NEXT:    vzeroupper
873; AVX512DQ-FCP-NEXT:    retq
874;
875; AVX512BW-LABEL: load_i32_stride7_vf4:
876; AVX512BW:       # %bb.0:
877; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
878; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
879; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
880; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
881; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
882; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
883; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
884; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
885; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
886; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
887; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
888; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
889; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
890; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
891; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
892; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
893; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
894; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
895; AVX512BW-NEXT:    vmovdqa %xmm2, (%rsi)
896; AVX512BW-NEXT:    vmovdqa %xmm3, (%rdx)
897; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
898; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
899; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
900; AVX512BW-NEXT:    vmovdqa %xmm7, (%r10)
901; AVX512BW-NEXT:    vmovdqa %xmm8, (%rax)
902; AVX512BW-NEXT:    vzeroupper
903; AVX512BW-NEXT:    retq
904;
905; AVX512BW-FCP-LABEL: load_i32_stride7_vf4:
906; AVX512BW-FCP:       # %bb.0:
907; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
908; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
909; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
910; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
911; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
912; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
913; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
914; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
915; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
916; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
917; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
918; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
919; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
920; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
921; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
922; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
923; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
924; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
925; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
926; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
927; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
928; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
929; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
930; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%r10)
931; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%rax)
932; AVX512BW-FCP-NEXT:    vzeroupper
933; AVX512BW-FCP-NEXT:    retq
934;
935; AVX512DQ-BW-LABEL: load_i32_stride7_vf4:
936; AVX512DQ-BW:       # %bb.0:
937; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
938; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
939; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
940; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
941; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
942; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
943; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
944; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
945; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
946; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
947; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
948; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
949; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
950; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
951; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
952; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
953; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
954; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
955; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rsi)
956; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rdx)
957; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
958; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
959; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
960; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%r10)
961; AVX512DQ-BW-NEXT:    vmovdqa %xmm8, (%rax)
962; AVX512DQ-BW-NEXT:    vzeroupper
963; AVX512DQ-BW-NEXT:    retq
964;
965; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf4:
966; AVX512DQ-BW-FCP:       # %bb.0:
967; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
968; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
969; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
970; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
971; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
972; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
973; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
974; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
975; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
976; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
977; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
978; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
979; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
980; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
981; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
982; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
983; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
984; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
985; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
986; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
987; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
988; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
989; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
990; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%r10)
991; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%rax)
992; AVX512DQ-BW-FCP-NEXT:    vzeroupper
993; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <28 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
  %strided.vec1 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
  %strided.vec2 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
  %strided.vec3 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
  %strided.vec4 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
  %strided.vec5 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
  %strided.vec6 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <4 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1013; SSE-LABEL: load_i32_stride7_vf8:
1014; SSE:       # %bb.0:
1015; SSE-NEXT:    subq $24, %rsp
1016; SSE-NEXT:    movdqa 144(%rdi), %xmm9
1017; SSE-NEXT:    movdqa 80(%rdi), %xmm5
1018; SSE-NEXT:    movdqa (%rdi), %xmm12
1019; SSE-NEXT:    movdqa 16(%rdi), %xmm11
1020; SSE-NEXT:    movdqa 48(%rdi), %xmm6
1021; SSE-NEXT:    movdqa 192(%rdi), %xmm8
1022; SSE-NEXT:    movdqa 160(%rdi), %xmm10
1023; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1024; SSE-NEXT:    movdqa 112(%rdi), %xmm15
1025; SSE-NEXT:    movdqa 128(%rdi), %xmm0
1026; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1027; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1028; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
1029; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
1030; SSE-NEXT:    movdqa %xmm15, %xmm3
1031; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1032; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
1033; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
1034; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
1035; SSE-NEXT:    movapd %xmm0, (%rsp) # 16-byte Spill
1036; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
1037; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1]
1038; SSE-NEXT:    movdqa %xmm12, %xmm4
1039; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
1040; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1041; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
1042; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
1043; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1044; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1045; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
1046; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
1047; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1048; SSE-NEXT:    movdqa %xmm10, %xmm4
1049; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1050; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
1051; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1052; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
1053; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1054; SSE-NEXT:    movdqa %xmm6, %xmm1
1055; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1056; SSE-NEXT:    movdqa 32(%rdi), %xmm4
1057; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1058; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
1059; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1060; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
1061; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1062; SSE-NEXT:    movdqa 176(%rdi), %xmm10
1063; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3]
1064; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[0,0,1,1]
1065; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
1066; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1]
1067; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
1068; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
1069; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1070; SSE-NEXT:    movdqa 64(%rdi), %xmm14
1071; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
1072; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm14[0,0,1,1]
1073; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
1074; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
1075; SSE-NEXT:    movdqa 208(%rdi), %xmm3
1076; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
1077; SSE-NEXT:    movdqa %xmm10, %xmm7
1078; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
1079; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3]
1080; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
1081; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
1082; SSE-NEXT:    movdqa 96(%rdi), %xmm5
1083; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1]
1084; SSE-NEXT:    movdqa %xmm14, %xmm15
1085; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
1086; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
1087; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1088; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
1089; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
1090; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1091; SSE-NEXT:    movdqa %xmm6, %xmm12
1092; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
1093; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3]
1094; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
1095; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm12[0],xmm9[1]
1096; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
1097; SSE-NEXT:    movdqa %xmm11, %xmm12
1098; SSE-NEXT:    movdqa %xmm11, %xmm4
1099; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
1100; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3]
1101; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1102; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
1103; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
1104; SSE-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1105; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1]
1106; SSE-NEXT:    movdqa %xmm6, %xmm11
1107; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1108; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1109; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1]
1110; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2]
1111; SSE-NEXT:    punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
1112; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1]
1113; SSE-NEXT:    movdqa %xmm12, %xmm6
1114; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1115; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
1116; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1]
1117; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
1118; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3]
1119; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
1120; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1121; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1122; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
1123; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1124; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1]
1125; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
1126; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm6[2,3,2,3]
1127; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
1128; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
1129; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1130; SSE-NEXT:    # xmm3 = mem[0,0,1,1]
1131; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1132; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1]
1133; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1134; SSE-NEXT:    movaps %xmm1, (%rsi)
1135; SSE-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
1136; SSE-NEXT:    movaps %xmm1, 16(%rsi)
1137; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1138; SSE-NEXT:    movaps %xmm0, (%rdx)
1139; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1140; SSE-NEXT:    movaps %xmm0, 16(%rdx)
1141; SSE-NEXT:    movapd %xmm8, (%rcx)
1142; SSE-NEXT:    movapd %xmm13, 16(%rcx)
1143; SSE-NEXT:    movapd %xmm15, (%r8)
1144; SSE-NEXT:    movapd %xmm7, 16(%r8)
1145; SSE-NEXT:    movapd %xmm2, (%r9)
1146; SSE-NEXT:    movapd %xmm9, 16(%r9)
1147; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1148; SSE-NEXT:    movapd %xmm14, (%rax)
1149; SSE-NEXT:    movapd %xmm10, 16(%rax)
1150; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1151; SSE-NEXT:    movapd %xmm3, (%rax)
1152; SSE-NEXT:    movapd %xmm4, 16(%rax)
1153; SSE-NEXT:    addq $24, %rsp
1154; SSE-NEXT:    retq
1155;
1156; AVX-LABEL: load_i32_stride7_vf8:
1157; AVX:       # %bb.0:
1158; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
1159; AVX-NEXT:    vmovaps 128(%rdi), %ymm7
1160; AVX-NEXT:    vmovaps 64(%rdi), %ymm10
1161; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
1162; AVX-NEXT:    vmovaps (%rdi), %ymm1
1163; AVX-NEXT:    vmovaps 96(%rdi), %ymm12
1164; AVX-NEXT:    vmovaps 80(%rdi), %xmm2
1165; AVX-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
1166; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
1167; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
1168; AVX-NEXT:    vmovaps (%rdi), %xmm14
1169; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
1170; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
1171; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3]
1172; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1,2],ymm3[3,4,5,6,7]
1173; AVX-NEXT:    vmovaps 160(%rdi), %xmm3
1174; AVX-NEXT:    vmovaps 128(%rdi), %xmm5
1175; AVX-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm5[1],xmm3[1]
1176; AVX-NEXT:    vmovaps 192(%rdi), %xmm11
1177; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = zero,xmm8[1,2],xmm11[1]
1178; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
1179; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
1180; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[1,1],ymm10[2,2],ymm12[5,5],ymm10[6,6]
1181; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3]
1182; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm9[0],xmm14[1],xmm9[2,3]
1183; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,0],mem[3,3]
1184; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7]
1185; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm7[2,3],ymm4[0,1]
1186; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm13[3,3],ymm7[4,4],ymm13[7,7]
1187; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
1188; AVX-NEXT:    vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm11[2]
1189; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
1190; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7]
1191; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm14[2,3,2,3]
1192; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3]
1193; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm10[0,3],ymm2[7,5],ymm10[4,7]
1194; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
1195; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3,4,5,6,7]
1196; AVX-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
1197; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
1198; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
1199; AVX-NEXT:    vmovaps 192(%rdi), %ymm13
1200; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
1201; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4],ymm11[5,6,7]
1202; AVX-NEXT:    vmovaps 64(%rdi), %xmm15
1203; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4]
1204; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6]
1205; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm14[3]
1206; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
1207; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7]
1208; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[0,1],ymm4[1,3],ymm13[4,5],ymm4[5,7]
1209; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm7[0,2],ymm12[2,0],ymm7[4,6],ymm12[6,4]
1210; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7]
1211; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4]
1212; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm7[2,3,0,1]
1213; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm14[0,0],ymm7[7,4],ymm14[4,4]
1214; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm12[2,0],ymm7[6,4],ymm12[6,4]
1215; AVX-NEXT:    vmovaps 96(%rdi), %xmm12
1216; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm12[0,1,0,1]
1217; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
1218; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
1219; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
1220; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3]
1221; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1222; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[2,1],ymm4[3,3],ymm13[6,5],ymm4[7,7]
1223; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm3[0],xmm5[1],xmm3[2,3]
1224; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
1225; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4]
1226; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3]
1227; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
1228; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
1229; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm14[2,0],xmm9[3,2]
1230; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
1231; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
1232; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3]
1233; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
1234; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
1235; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1236; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm13[2,3,0,1]
1237; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm1[0,0],ymm13[7,4],ymm1[4,4]
1238; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
1239; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
1240; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
1241; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
1242; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1243; AVX-NEXT:    vmovaps %ymm6, (%rsi)
1244; AVX-NEXT:    vmovaps %ymm8, (%rdx)
1245; AVX-NEXT:    vmovaps %ymm11, (%rcx)
1246; AVX-NEXT:    vmovaps %ymm10, (%r8)
1247; AVX-NEXT:    vmovaps %ymm7, (%r9)
1248; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1249; AVX-NEXT:    vmovaps %ymm4, (%rax)
1250; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1251; AVX-NEXT:    vmovaps %ymm0, (%rax)
1252; AVX-NEXT:    vzeroupper
1253; AVX-NEXT:    retq
1254;
1255; AVX2-LABEL: load_i32_stride7_vf8:
1256; AVX2:       # %bb.0:
1257; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1258; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1259; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm9
1260; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm4
1261; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm5
1262; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1263; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1264; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm10
1265; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm2
1266; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1267; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0]
1268; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
1269; AVX2-NEXT:    vpermd %ymm6, %ymm3, %ymm3
1270; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
1271; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm6
1272; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
1273; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1]
1274; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
1275; AVX2-NEXT:    vpbroadcastd 196(%rdi), %ymm7
1276; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
1277; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
1278; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = mem[2,2,2,2]
1279; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
1280; AVX2-NEXT:    vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
1281; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
1282; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
1283; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
1284; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
1285; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7]
1286; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0]
1287; AVX2-NEXT:    vpermd %ymm7, %ymm11, %ymm7
1288; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1289; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm7
1290; AVX2-NEXT:    vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1291; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7]
1292; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm11
1293; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm12
1294; AVX2-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
1295; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
1296; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
1297; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm13
1298; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7]
1299; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7]
1300; AVX2-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3]
1301; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3]
1302; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7]
1303; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4]
1304; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7]
1305; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7]
1306; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm11
1307; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
1308; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
1309; AVX2-NEXT:    vpbroadcastd 100(%rdi), %xmm10
1310; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm11
1311; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
1312; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0]
1313; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1314; AVX2-NEXT:    vpermd %ymm13, %ymm12, %ymm12
1315; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
1316; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7]
1317; AVX2-NEXT:    vpermd %ymm5, %ymm12, %ymm13
1318; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7]
1319; AVX2-NEXT:    vpbroadcastd 212(%rdi), %ymm14
1320; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
1321; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
1322; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
1323; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
1324; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
1325; AVX2-NEXT:    vpbroadcastd 216(%rdi), %ymm5
1326; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
1327; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm5
1328; AVX2-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3]
1329; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
1330; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,3,5,4,6,7]
1331; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm8
1332; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3]
1333; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
1334; AVX2-NEXT:    vpermd 192(%rdi), %ymm12, %ymm8
1335; AVX2-NEXT:    vpbroadcastd 136(%rdi), %xmm11
1336; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3]
1337; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
1338; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
1339; AVX2-NEXT:    vpbroadcastd 80(%rdi), %ymm8
1340; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
1341; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
1342; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
1343; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1344; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
1345; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1346; AVX2-NEXT:    vmovdqa %ymm2, (%rsi)
1347; AVX2-NEXT:    vmovdqa %ymm6, (%rdx)
1348; AVX2-NEXT:    vmovdqa %ymm7, (%rcx)
1349; AVX2-NEXT:    vmovdqa %ymm9, (%r8)
1350; AVX2-NEXT:    vmovdqa %ymm10, (%r9)
1351; AVX2-NEXT:    vmovdqa %ymm4, (%r10)
1352; AVX2-NEXT:    vmovdqa %ymm0, (%rax)
1353; AVX2-NEXT:    vzeroupper
1354; AVX2-NEXT:    retq
1355;
1356; AVX2-FP-LABEL: load_i32_stride7_vf8:
1357; AVX2-FP:       # %bb.0:
1358; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1359; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1360; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm9
1361; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm4
1362; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm5
1363; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
1364; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
1365; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm10
1366; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm2
1367; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1368; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0]
1369; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
1370; AVX2-FP-NEXT:    vpermd %ymm6, %ymm3, %ymm3
1371; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
1372; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm6
1373; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm3
1374; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1]
1375; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
1376; AVX2-FP-NEXT:    vpbroadcastd 196(%rdi), %ymm7
1377; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
1378; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
1379; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = mem[2,2,2,2]
1380; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
1381; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
1382; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
1383; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
1384; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
1385; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
1386; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7]
1387; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0]
1388; AVX2-FP-NEXT:    vpermd %ymm7, %ymm11, %ymm7
1389; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1390; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm7
1391; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1392; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7]
1393; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm11
1394; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm12
1395; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
1396; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
1397; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
1398; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm13
1399; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7]
1400; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7]
1401; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3]
1402; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3]
1403; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7]
1404; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4]
1405; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7]
1406; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7]
1407; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm11
1408; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
1409; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
1410; AVX2-FP-NEXT:    vpbroadcastd 100(%rdi), %xmm10
1411; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm11
1412; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
1413; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0]
1414; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1415; AVX2-FP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
1416; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
1417; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7]
1418; AVX2-FP-NEXT:    vpermd %ymm5, %ymm12, %ymm13
1419; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7]
1420; AVX2-FP-NEXT:    vpbroadcastd 212(%rdi), %ymm14
1421; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
1422; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
1423; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
1424; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
1425; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
1426; AVX2-FP-NEXT:    vpbroadcastd 216(%rdi), %ymm5
1427; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
1428; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm5
1429; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3]
1430; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
1431; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,3,5,4,6,7]
1432; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm8
1433; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3]
1434; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
1435; AVX2-FP-NEXT:    vpermd 192(%rdi), %ymm12, %ymm8
1436; AVX2-FP-NEXT:    vpbroadcastd 136(%rdi), %xmm11
1437; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3]
1438; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
1439; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
1440; AVX2-FP-NEXT:    vpbroadcastd 80(%rdi), %ymm8
1441; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
1442; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
1443; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
1444; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
1445; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
1446; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1447; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rsi)
1448; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rdx)
1449; AVX2-FP-NEXT:    vmovdqa %ymm7, (%rcx)
1450; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
1451; AVX2-FP-NEXT:    vmovdqa %ymm10, (%r9)
1452; AVX2-FP-NEXT:    vmovdqa %ymm4, (%r10)
1453; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
1454; AVX2-FP-NEXT:    vzeroupper
1455; AVX2-FP-NEXT:    retq
1456;
1457; AVX2-FCP-LABEL: load_i32_stride7_vf8:
1458; AVX2-FCP:       # %bb.0:
1459; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1460; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1461; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm9
1462; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
1463; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
1464; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1465; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1466; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm10
1467; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm2
1468; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1469; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0]
1470; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
1471; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm3, %ymm3
1472; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
1473; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm6
1474; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
1475; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1]
1476; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
1477; AVX2-FCP-NEXT:    vpbroadcastd 196(%rdi), %ymm7
1478; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
1479; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
1480; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = mem[2,2,2,2]
1481; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
1482; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
1483; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
1484; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
1485; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
1486; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
1487; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7]
1488; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0]
1489; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm11, %ymm7
1490; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1491; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm7
1492; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1493; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7]
1494; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm11
1495; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
1496; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
1497; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
1498; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
1499; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm13
1500; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7]
1501; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7]
1502; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3]
1503; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3]
1504; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7]
1505; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4]
1506; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7]
1507; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7]
1508; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm11
1509; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
1510; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
1511; AVX2-FCP-NEXT:    vpbroadcastd 100(%rdi), %xmm10
1512; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm11
1513; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
1514; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0]
1515; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1516; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
1517; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
1518; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7]
1519; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm12, %ymm13
1520; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7]
1521; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm14
1522; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
1523; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
1524; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
1525; AVX2-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,0,1,7]
1526; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm4
1527; AVX2-FCP-NEXT:    vpbroadcastd 216(%rdi), %ymm5
1528; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
1529; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
1530; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3]
1531; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
1532; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,3,5,4,6,7]
1533; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
1534; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3]
1535; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
1536; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm12, %ymm8
1537; AVX2-FCP-NEXT:    vpbroadcastd 136(%rdi), %xmm11
1538; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3]
1539; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
1540; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
1541; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm8
1542; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
1543; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
1544; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
1545; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
1546; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
1547; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1548; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
1549; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
1550; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
1551; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%r8)
1552; AVX2-FCP-NEXT:    vmovdqa %ymm10, (%r9)
1553; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%r10)
1554; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1555; AVX2-FCP-NEXT:    vzeroupper
1556; AVX2-FCP-NEXT:    retq
1557;
1558; AVX512-LABEL: load_i32_stride7_vf8:
1559; AVX512:       # %bb.0:
1560; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1561; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1562; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
1563; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
1564; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm2
1565; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
1566; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1567; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1568; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1569; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1570; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1571; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1572; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1573; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1574; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1575; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1576; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1577; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1578; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1579; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1580; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1581; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1582; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1583; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1584; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1585; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1586; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1587; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1588; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1589; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1590; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1591; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1592; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1593; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1594; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1595; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1596; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1597; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1598; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1599; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1600; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1601; AVX512-NEXT:    vmovdqa %ymm4, (%rsi)
1602; AVX512-NEXT:    vmovdqa %ymm5, (%rdx)
1603; AVX512-NEXT:    vmovdqa %ymm6, (%rcx)
1604; AVX512-NEXT:    vmovdqa %ymm7, (%r8)
1605; AVX512-NEXT:    vmovdqa %ymm8, (%r9)
1606; AVX512-NEXT:    vmovdqa %ymm9, (%r10)
1607; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
1608; AVX512-NEXT:    vzeroupper
1609; AVX512-NEXT:    retq
1610;
1611; AVX512-FCP-LABEL: load_i32_stride7_vf8:
1612; AVX512-FCP:       # %bb.0:
1613; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1614; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1615; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1616; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1617; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1618; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1619; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1620; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1621; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1622; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1623; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1624; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1625; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1626; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1627; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1628; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1629; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1630; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1631; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1632; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1633; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1634; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1635; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1636; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1637; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1638; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1639; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1640; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1641; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1642; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1643; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1644; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1645; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1646; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1647; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1648; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1649; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1650; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1651; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1652; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1653; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1654; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1655; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1656; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1657; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1658; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1659; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%r10)
1660; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1661; AVX512-FCP-NEXT:    vzeroupper
1662; AVX512-FCP-NEXT:    retq
1663;
1664; AVX512DQ-LABEL: load_i32_stride7_vf8:
1665; AVX512DQ:       # %bb.0:
1666; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1667; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1668; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
1669; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
1670; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm2
1671; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm3
1672; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1673; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1674; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1675; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1676; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1677; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1678; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1679; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1680; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1681; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1682; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1683; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1684; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1685; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1686; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1687; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1688; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1689; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1690; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1691; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1692; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1693; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1694; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1695; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1696; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1697; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1698; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1699; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1700; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1701; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1702; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1703; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1704; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1705; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1706; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1707; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rsi)
1708; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rdx)
1709; AVX512DQ-NEXT:    vmovdqa %ymm6, (%rcx)
1710; AVX512DQ-NEXT:    vmovdqa %ymm7, (%r8)
1711; AVX512DQ-NEXT:    vmovdqa %ymm8, (%r9)
1712; AVX512DQ-NEXT:    vmovdqa %ymm9, (%r10)
1713; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
1714; AVX512DQ-NEXT:    vzeroupper
1715; AVX512DQ-NEXT:    retq
1716;
1717; AVX512DQ-FCP-LABEL: load_i32_stride7_vf8:
1718; AVX512DQ-FCP:       # %bb.0:
1719; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1720; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1721; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1722; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1723; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1724; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1725; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1726; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1727; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1728; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1729; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1730; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1731; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1732; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1733; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1734; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1735; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1736; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1737; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1738; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1739; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1740; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1741; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1742; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1743; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1744; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1745; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1746; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1747; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1748; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1749; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1750; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1751; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1752; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1753; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1754; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1755; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1756; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1757; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1758; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1759; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1760; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1761; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1762; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1763; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1764; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1765; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%r10)
1766; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1767; AVX512DQ-FCP-NEXT:    vzeroupper
1768; AVX512DQ-FCP-NEXT:    retq
1769;
1770; AVX512BW-LABEL: load_i32_stride7_vf8:
1771; AVX512BW:       # %bb.0:
1772; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1773; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1774; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1775; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1776; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
1777; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
1778; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1779; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1780; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1781; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1782; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1783; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1784; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1785; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1786; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1787; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1788; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1789; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1790; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1791; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1792; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1793; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1794; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1795; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1796; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1797; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1798; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1799; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1800; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1801; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1802; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1803; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1804; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1805; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1806; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1807; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1808; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1809; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1810; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1811; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1812; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1813; AVX512BW-NEXT:    vmovdqa %ymm4, (%rsi)
1814; AVX512BW-NEXT:    vmovdqa %ymm5, (%rdx)
1815; AVX512BW-NEXT:    vmovdqa %ymm6, (%rcx)
1816; AVX512BW-NEXT:    vmovdqa %ymm7, (%r8)
1817; AVX512BW-NEXT:    vmovdqa %ymm8, (%r9)
1818; AVX512BW-NEXT:    vmovdqa %ymm9, (%r10)
1819; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
1820; AVX512BW-NEXT:    vzeroupper
1821; AVX512BW-NEXT:    retq
1822;
1823; AVX512BW-FCP-LABEL: load_i32_stride7_vf8:
1824; AVX512BW-FCP:       # %bb.0:
1825; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1826; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1827; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1828; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1829; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1830; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1831; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1832; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1833; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1834; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1835; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1836; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1837; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1838; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1839; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1840; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1841; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1842; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1843; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1844; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1845; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1846; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1847; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1848; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1849; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1850; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1851; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1852; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1853; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1854; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1855; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1856; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1857; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1858; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1859; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1860; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1861; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1862; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1863; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1864; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1865; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1866; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1867; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1868; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1869; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1870; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1871; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%r10)
1872; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1873; AVX512BW-FCP-NEXT:    vzeroupper
1874; AVX512BW-FCP-NEXT:    retq
1875;
1876; AVX512DQ-BW-LABEL: load_i32_stride7_vf8:
1877; AVX512DQ-BW:       # %bb.0:
1878; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1879; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1880; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1881; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1882; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
1883; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
1884; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1885; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1886; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1887; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1888; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1889; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1890; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1891; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1892; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1893; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1894; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1895; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1896; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1897; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1898; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1899; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1900; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1901; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1902; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1903; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1904; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1905; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1906; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1907; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1908; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1909; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1910; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1911; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1912; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1913; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1914; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1915; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1916; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1917; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1918; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1919; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rsi)
1920; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%rdx)
1921; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%rcx)
1922; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%r8)
1923; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%r9)
1924; AVX512DQ-BW-NEXT:    vmovdqa %ymm9, (%r10)
1925; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
1926; AVX512DQ-BW-NEXT:    vzeroupper
1927; AVX512DQ-BW-NEXT:    retq
1928;
1929; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf8:
1930; AVX512DQ-BW-FCP:       # %bb.0:
1931; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1932; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1933; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1934; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1935; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1936; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
1937; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17]
1938; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1939; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
1940; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
1941; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1942; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
1943; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
1944; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
1945; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
1946; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
1947; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
1948; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
1949; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
1950; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
1951; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
1952; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
1953; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
1954; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
1955; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
1956; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
1957; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
1958; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
1959; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
1960; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
1961; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1962; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
1963; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
1964; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
1965; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
1966; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1967; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
1968; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
1969; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
1970; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
1971; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
1972; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
1973; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
1974; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
1975; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
1976; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%r9)
1977; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r10)
1978; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1979; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1980; AVX512DQ-BW-FCP-NEXT:    retq
1981  %wide.vec = load <56 x i32>, ptr %in.vec, align 64
1982  %strided.vec0 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
1983  %strided.vec1 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
1984  %strided.vec2 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
1985  %strided.vec3 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
1986  %strided.vec4 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
1987  %strided.vec5 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
1988  %strided.vec6 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
1989  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
1990  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
1991  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
1992  store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
1993  store <8 x i32> %strided.vec4, ptr %out.vec4, align 64
1994  store <8 x i32> %strided.vec5, ptr %out.vec5, align 64
1995  store <8 x i32> %strided.vec6, ptr %out.vec6, align 64
1996  ret void
1997}
1998
1999define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
2000; SSE-LABEL: load_i32_stride7_vf16:
2001; SSE:       # %bb.0:
2002; SSE-NEXT:    subq $440, %rsp # imm = 0x1B8
2003; SSE-NEXT:    movdqa 304(%rdi), %xmm3
2004; SSE-NEXT:    movdqa 272(%rdi), %xmm5
2005; SSE-NEXT:    movdqa 224(%rdi), %xmm15
2006; SSE-NEXT:    movdqa 240(%rdi), %xmm6
2007; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2008; SSE-NEXT:    movdqa 80(%rdi), %xmm7
2009; SSE-NEXT:    movdqa (%rdi), %xmm2
2010; SSE-NEXT:    movdqa 16(%rdi), %xmm8
2011; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2012; SSE-NEXT:    movdqa 48(%rdi), %xmm9
2013; SSE-NEXT:    movdqa 192(%rdi), %xmm14
2014; SSE-NEXT:    movdqa 160(%rdi), %xmm12
2015; SSE-NEXT:    movdqa 112(%rdi), %xmm4
2016; SSE-NEXT:    movdqa 128(%rdi), %xmm0
2017; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2018; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2019; SSE-NEXT:    movdqa %xmm4, %xmm1
2020; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2021; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
2022; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2023; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
2024; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2025; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2026; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
2027; SSE-NEXT:    movdqa %xmm2, %xmm1
2028; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2029; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
2030; SSE-NEXT:    movdqa %xmm9, %xmm11
2031; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2032; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
2033; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2034; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2035; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
2036; SSE-NEXT:    movdqa %xmm15, %xmm1
2037; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2038; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
2039; SSE-NEXT:    movdqa %xmm5, %xmm9
2040; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2041; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2042; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2043; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2044; SSE-NEXT:    movdqa 336(%rdi), %xmm1
2045; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2046; SSE-NEXT:    movdqa 352(%rdi), %xmm0
2047; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2048; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2049; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2050; SSE-NEXT:    movdqa 416(%rdi), %xmm8
2051; SSE-NEXT:    movdqa 384(%rdi), %xmm13
2052; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
2053; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2054; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
2055; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2056; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2057; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
2058; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2059; SSE-NEXT:    movdqa %xmm12, %xmm1
2060; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2061; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2062; SSE-NEXT:    movdqa %xmm4, %xmm5
2063; SSE-NEXT:    movdqa 144(%rdi), %xmm4
2064; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2065; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2066; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2067; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2068; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
2069; SSE-NEXT:    movdqa %xmm7, %xmm12
2070; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2071; SSE-NEXT:    movdqa %xmm11, %xmm1
2072; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2073; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
2074; SSE-NEXT:    movdqa %xmm2, %xmm10
2075; SSE-NEXT:    movdqa 32(%rdi), %xmm7
2076; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
2077; SSE-NEXT:    movdqa %xmm7, (%rsp) # 16-byte Spill
2078; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2079; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2080; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2081; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
2082; SSE-NEXT:    movdqa %xmm9, %xmm1
2083; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2084; SSE-NEXT:    movdqa %xmm15, %xmm11
2085; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2086; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
2087; SSE-NEXT:    movdqa 256(%rdi), %xmm15
2088; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
2089; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2090; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2091; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
2092; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2093; SSE-NEXT:    movdqa %xmm13, %xmm1
2094; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2095; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2096; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
2097; SSE-NEXT:    movdqa 368(%rdi), %xmm2
2098; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2099; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2100; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2101; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
2102; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
2103; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2104; SSE-NEXT:    movdqa 176(%rdi), %xmm4
2105; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3]
2106; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1]
2107; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
2108; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
2109; SSE-NEXT:    movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2110; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
2111; SSE-NEXT:    movdqa %xmm10, %xmm14
2112; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
2113; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2114; SSE-NEXT:    movdqa 64(%rdi), %xmm9
2115; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
2116; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1]
2117; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
2118; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
2119; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2120; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
2121; SSE-NEXT:    movdqa %xmm15, %xmm12
2122; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
2123; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2124; SSE-NEXT:    movdqa 288(%rdi), %xmm15
2125; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
2126; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
2127; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
2128; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2129; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[2,3,2,3]
2131; SSE-NEXT:    movdqa %xmm13, %xmm1
2132; SSE-NEXT:    movdqa %xmm2, %xmm11
2133; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
2134; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
2135; SSE-NEXT:    movdqa 400(%rdi), %xmm13
2136; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
2137; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1]
2138; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
2139; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
2140; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2141; SSE-NEXT:    movdqa 208(%rdi), %xmm10
2142; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1]
2143; SSE-NEXT:    movdqa %xmm4, %xmm2
2144; SSE-NEXT:    movdqa %xmm4, %xmm0
2145; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
2146; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
2147; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2148; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
2149; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2150; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2151; SSE-NEXT:    movdqa 96(%rdi), %xmm0
2152; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2153; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1]
2154; SSE-NEXT:    movdqa %xmm9, %xmm3
2155; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2156; SSE-NEXT:    movdqa %xmm9, %xmm0
2157; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2158; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3]
2159; SSE-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
2160; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2161; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2162; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2163; SSE-NEXT:    movdqa 320(%rdi), %xmm7
2164; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1]
2165; SSE-NEXT:    movdqa %xmm15, %xmm0
2166; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2167; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2168; SSE-NEXT:    # xmm4 = mem[2,2,3,3]
2169; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
2170; SSE-NEXT:    movdqa %xmm12, %xmm14
2171; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2172; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2173; SSE-NEXT:    movdqa 432(%rdi), %xmm0
2174; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2175; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2176; SSE-NEXT:    movdqa %xmm13, %xmm4
2177; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
2178; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
2179; SSE-NEXT:    movdqa %xmm11, %xmm9
2180; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
2181; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
2182; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2183; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
2184; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2185; SSE-NEXT:    movdqa %xmm8, %xmm5
2186; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2187; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
2188; SSE-NEXT:    movdqa %xmm2, %xmm1
2189; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
2190; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
2191; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2192; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3]
2193; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2194; SSE-NEXT:    movdqa %xmm4, %xmm0
2195; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
2196; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3]
2197; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2198; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
2199; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
2200; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3]
2201; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2202; SSE-NEXT:    movdqa %xmm6, %xmm5
2203; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2204; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3]
2205; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
2206; SSE-NEXT:    movdqa %xmm7, %xmm11
2207; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2208; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1]
2209; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
2210; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2211; SSE-NEXT:    movdqa %xmm7, %xmm5
2212; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2213; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm13[2,2,3,3]
2214; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2215; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
2216; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1]
2217; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
2218; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2219; SSE-NEXT:    movdqa %xmm8, %xmm5
2220; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2221; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2222; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
2223; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2224; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2225; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
2226; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2227; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2228; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2229; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2230; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2231; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2232; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2233; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
2234; SSE-NEXT:    punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2235; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2236; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2237; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2238; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
2239; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
2240; SSE-NEXT:    movdqa %xmm3, %xmm11
2241; SSE-NEXT:    punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
2242; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
2243; SSE-NEXT:    movdqa %xmm7, %xmm2
2244; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2245; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2246; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
2247; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2248; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
2249; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2250; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
2251; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2252; SSE-NEXT:    # xmm10 = mem[0,0,1,1]
2253; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
2254; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
2255; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2256; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2257; SSE-NEXT:    # xmm5 = mem[2,3,2,3]
2258; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2259; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2260; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
2261; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2262; SSE-NEXT:    # xmm7 = mem[0,0,1,1]
2263; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2264; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
2265; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2266; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
2267; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2268; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2269; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
2270; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2271; SSE-NEXT:    # xmm6 = mem[0,0,1,1]
2272; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
2273; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1]
2274; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
2275; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
2276; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2277; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
2278; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2279; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
2280; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
2281; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
2282; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2283; SSE-NEXT:    movaps %xmm0, 48(%rsi)
2284; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2285; SSE-NEXT:    movaps %xmm0, 32(%rsi)
2286; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2287; SSE-NEXT:    movaps %xmm0, (%rsi)
2288; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2289; SSE-NEXT:    movaps %xmm0, 16(%rsi)
2290; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2291; SSE-NEXT:    movaps %xmm0, 48(%rdx)
2292; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2293; SSE-NEXT:    movaps %xmm0, 32(%rdx)
2294; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2295; SSE-NEXT:    movaps %xmm0, (%rdx)
2296; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2297; SSE-NEXT:    movaps %xmm0, 16(%rdx)
2298; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2299; SSE-NEXT:    movaps %xmm0, 48(%rcx)
2300; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2301; SSE-NEXT:    movaps %xmm0, 32(%rcx)
2302; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2303; SSE-NEXT:    movaps %xmm0, (%rcx)
2304; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2305; SSE-NEXT:    movaps %xmm0, 16(%rcx)
2306; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2307; SSE-NEXT:    movaps %xmm0, 48(%r8)
2308; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2309; SSE-NEXT:    movaps %xmm0, 32(%r8)
2310; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2311; SSE-NEXT:    movaps %xmm0, (%r8)
2312; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2313; SSE-NEXT:    movaps %xmm0, 16(%r8)
2314; SSE-NEXT:    movapd %xmm9, 48(%r9)
2315; SSE-NEXT:    movapd %xmm14, 32(%r9)
2316; SSE-NEXT:    movapd %xmm12, (%r9)
2317; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2318; SSE-NEXT:    movaps %xmm0, 16(%r9)
2319; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2320; SSE-NEXT:    movapd %xmm13, 48(%rax)
2321; SSE-NEXT:    movapd %xmm15, 32(%rax)
2322; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2323; SSE-NEXT:    movaps %xmm0, (%rax)
2324; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2325; SSE-NEXT:    movaps %xmm0, 16(%rax)
2326; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2327; SSE-NEXT:    movapd %xmm4, 48(%rax)
2328; SSE-NEXT:    movapd %xmm6, 32(%rax)
2329; SSE-NEXT:    movapd %xmm7, (%rax)
2330; SSE-NEXT:    movapd %xmm10, 16(%rax)
2331; SSE-NEXT:    addq $440, %rsp # imm = 0x1B8
2332; SSE-NEXT:    retq
2333;
2334; AVX-LABEL: load_i32_stride7_vf16:
2335; AVX:       # %bb.0:
2336; AVX-NEXT:    subq $456, %rsp # imm = 0x1C8
2337; AVX-NEXT:    vmovaps 32(%rdi), %ymm4
2338; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2339; AVX-NEXT:    vmovaps (%rdi), %ymm6
2340; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2341; AVX-NEXT:    vmovaps 96(%rdi), %ymm15
2342; AVX-NEXT:    vmovaps 256(%rdi), %ymm2
2343; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2344; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
2345; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2346; AVX-NEXT:    vmovaps 320(%rdi), %ymm5
2347; AVX-NEXT:    vmovaps 304(%rdi), %xmm0
2348; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2349; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
2350; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
2351; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
2352; AVX-NEXT:    vmovaps 224(%rdi), %xmm13
2353; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
2354; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
2355; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2356; AVX-NEXT:    vmovaps 384(%rdi), %xmm2
2357; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2358; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
2359; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2360; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
2361; AVX-NEXT:    vmovaps 416(%rdi), %xmm12
2362; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[1]
2363; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2364; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2365; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2366; AVX-NEXT:    vmovaps 80(%rdi), %xmm0
2367; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2368; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
2369; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
2370; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
2371; AVX-NEXT:    vmovaps (%rdi), %xmm9
2372; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
2373; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
2374; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2375; AVX-NEXT:    vmovaps 160(%rdi), %xmm2
2376; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2377; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
2378; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2379; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
2380; AVX-NEXT:    vmovaps 192(%rdi), %xmm8
2381; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1]
2382; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2383; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2384; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2385; AVX-NEXT:    vmovaps 288(%rdi), %ymm6
2386; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm6[2,2],ymm5[5,5],ymm6[6,6]
2387; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
2388; AVX-NEXT:    vmovaps 256(%rdi), %xmm11
2389; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3]
2390; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
2391; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2392; AVX-NEXT:    vmovaps 384(%rdi), %ymm7
2393; AVX-NEXT:    vmovaps 352(%rdi), %ymm1
2394; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm7[0,1]
2395; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7]
2396; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
2397; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2]
2398; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2399; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
2400; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2401; AVX-NEXT:    vmovaps 64(%rdi), %ymm3
2402; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6]
2403; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
2404; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
2405; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3]
2406; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
2407; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7]
2408; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
2409; AVX-NEXT:    vmovaps 128(%rdi), %ymm0
2410; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1]
2411; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7]
2412; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2413; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2]
2414; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
2415; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
2416; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2417; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3]
2418; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3]
2419; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
2420; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7]
2421; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4]
2422; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
2423; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm7[0],ymm1[2],ymm7[2]
2424; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
2425; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
2426; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
2427; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7]
2428; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2429; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
2430; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
2431; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
2432; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7]
2433; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4]
2434; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7]
2435; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
2436; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
2437; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
2438; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
2439; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7]
2440; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2441; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm5[0,0],ymm6[5,4],ymm5[4,4]
2442; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6]
2443; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3]
2444; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
2445; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7]
2446; AVX-NEXT:    vmovaps 416(%rdi), %ymm2
2447; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm2[0,1],ymm7[1,3],ymm2[4,5],ymm7[5,7]
2448; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
2449; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
2450; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2451; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4]
2452; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6]
2453; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm9[3]
2454; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
2455; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
2456; AVX-NEXT:    vmovaps 192(%rdi), %ymm6
2457; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm4[1,3],ymm6[4,5],ymm4[5,7]
2458; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm5[2,0],ymm0[4,6],ymm5[6,4]
2459; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
2460; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2461; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
2462; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
2463; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4]
2464; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4]
2465; AVX-NEXT:    vmovaps 320(%rdi), %xmm5
2466; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,1,0,1]
2467; AVX-NEXT:    vmovaps 288(%rdi), %xmm8
2468; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3]
2469; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm11[2,3,2,3]
2470; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
2471; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3]
2472; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
2473; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2474; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
2475; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
2476; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
2477; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
2478; AVX-NEXT:    vmovaps 64(%rdi), %xmm11
2479; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
2480; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1]
2481; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
2482; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
2483; AVX-NEXT:    vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
2484; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3]
2485; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2486; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
2487; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,1],ymm7[3,3],ymm2[6,5],ymm7[7,7]
2488; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2489; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2490; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0],xmm0[1],xmm14[2,3]
2491; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2492; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4]
2493; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm8[3]
2494; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
2495; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
2496; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4]
2497; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
2498; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2]
2499; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm3[4,5,6,7]
2500; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[2,1],ymm4[3,3],ymm6[6,5],ymm4[7,7]
2501; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2502; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2503; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm3[0],xmm1[1],xmm3[2,3]
2504; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2505; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4]
2506; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm11[3]
2507; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2508; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
2509; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4]
2510; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
2511; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2]
2512; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
2513; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2514; AVX-NEXT:    # xmm7 = mem[0,1,0,1]
2515; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
2516; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4]
2517; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
2518; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
2519; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
2520; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4]
2521; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3]
2522; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3]
2523; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2524; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,0],ymm7[4,5],ymm2[6,4]
2525; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2526; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2527; AVX-NEXT:    # xmm5 = mem[0,1,0,1]
2528; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
2529; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
2530; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
2531; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
2532; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1]
2533; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[0,0],ymm6[7,4],ymm7[4,4]
2534; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3]
2535; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
2536; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
2537; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
2538; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2539; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2540; AVX-NEXT:    vmovaps %ymm0, (%rsi)
2541; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2542; AVX-NEXT:    vmovaps %ymm6, 32(%rsi)
2543; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2544; AVX-NEXT:    vmovaps %ymm0, (%rdx)
2545; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2546; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
2547; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2548; AVX-NEXT:    vmovaps %ymm0, (%rcx)
2549; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2550; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
2551; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2552; AVX-NEXT:    vmovaps %ymm0, (%r8)
2553; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2554; AVX-NEXT:    vmovaps %ymm0, 32(%r8)
2555; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2556; AVX-NEXT:    vmovaps %ymm0, (%r9)
2557; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2558; AVX-NEXT:    vmovaps %ymm0, 32(%r9)
2559; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2560; AVX-NEXT:    vmovaps %ymm4, (%rax)
2561; AVX-NEXT:    vmovaps %ymm15, 32(%rax)
2562; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2563; AVX-NEXT:    vmovaps %ymm5, (%rax)
2564; AVX-NEXT:    vmovaps %ymm2, 32(%rax)
2565; AVX-NEXT:    addq $456, %rsp # imm = 0x1C8
2566; AVX-NEXT:    vzeroupper
2567; AVX-NEXT:    retq
2568;
2569; AVX2-LABEL: load_i32_stride7_vf16:
2570; AVX2:       # %bb.0:
2571; AVX2-NEXT:    subq $264, %rsp # imm = 0x108
2572; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm5
2573; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm9
2574; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm7
2575; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm4
2576; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm0
2577; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm3
2578; AVX2-NEXT:    vmovdqa (%rdi), %ymm10
2579; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
2580; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm15
2581; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm1
2582; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
2583; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0]
2584; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7]
2585; AVX2-NEXT:    vpermd %ymm8, %ymm2, %ymm8
2586; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
2587; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm8
2588; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm11
2589; AVX2-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2590; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
2591; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2592; AVX2-NEXT:    vpbroadcastd 196(%rdi), %ymm11
2593; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7]
2594; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
2595; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2596; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2597; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2598; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7]
2599; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
2600; AVX2-NEXT:    vpbroadcastq 304(%rdi), %ymm2
2601; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
2602; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
2603; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm2
2604; AVX2-NEXT:    vmovdqa 384(%rdi), %xmm8
2605; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2606; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
2607; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
2608; AVX2-NEXT:    vpbroadcastd 420(%rdi), %ymm8
2609; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
2610; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2611; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2612; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
2613; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2614; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
2615; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
2616; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2617; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
2618; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
2619; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2620; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
2621; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6]
2622; AVX2-NEXT:    vpermd %ymm2, %ymm12, %ymm2
2623; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7]
2624; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2625; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
2626; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
2627; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm3
2628; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
2629; AVX2-NEXT:    vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
2630; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0]
2631; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7]
2632; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm11
2633; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
2634; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7]
2635; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7]
2636; AVX2-NEXT:    vpermd %ymm0, %ymm12, %ymm0
2637; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2638; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2639; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
2640; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
2641; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
2642; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm1
2643; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm12
2644; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3]
2645; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2646; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2647; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm14
2648; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
2649; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2650; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2651; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm0
2652; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
2653; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
2654; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm1
2655; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm14
2656; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
2657; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2658; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
2659; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm13
2660; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7]
2661; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2662; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2663; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7]
2664; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
2665; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
2666; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
2667; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2668; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7]
2669; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm11
2670; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7]
2671; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2672; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
2673; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
2674; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3]
2675; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
2676; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
2677; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2678; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
2679; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm4
2680; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
2681; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2682; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2683; AVX2-NEXT:    vpbroadcastd 100(%rdi), %xmm0
2684; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm1
2685; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2686; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0]
2687; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7]
2688; AVX2-NEXT:    vmovdqa %ymm6, %ymm15
2689; AVX2-NEXT:    vpermd %ymm5, %ymm4, %ymm5
2690; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
2691; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
2692; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm11
2693; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7]
2694; AVX2-NEXT:    vpbroadcastd 212(%rdi), %ymm12
2695; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
2696; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7]
2697; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
2698; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2699; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7]
2700; AVX2-NEXT:    vpermd %ymm5, %ymm4, %ymm4
2701; AVX2-NEXT:    vpbroadcastd 324(%rdi), %xmm5
2702; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm13
2703; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3]
2704; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
2705; AVX2-NEXT:    vpermd %ymm7, %ymm0, %ymm5
2706; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
2707; AVX2-NEXT:    vpbroadcastd 436(%rdi), %ymm11
2708; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7]
2709; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7]
2710; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2711; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
2712; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
2713; AVX2-NEXT:    vpbroadcastd 216(%rdi), %ymm3
2714; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
2715; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm3
2716; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
2717; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2718; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7]
2719; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
2720; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
2721; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2722; AVX2-NEXT:    vmovdqa 320(%rdi), %xmm8
2723; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3]
2724; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
2725; AVX2-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
2726; AVX2-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
2727; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
2728; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
2729; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
2730; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
2731; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
2732; AVX2-NEXT:    vpbroadcastd 440(%rdi), %ymm5
2733; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
2734; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
2735; AVX2-NEXT:    vpbroadcastd 136(%rdi), %xmm4
2736; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2737; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
2738; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
2739; AVX2-NEXT:    vpermd 192(%rdi), %ymm0, %ymm5
2740; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
2741; AVX2-NEXT:    vpbroadcastd 80(%rdi), %ymm5
2742; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
2743; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7]
2744; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
2745; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
2746; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2747; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
2748; AVX2-NEXT:    vpbroadcastd 360(%rdi), %xmm4
2749; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2750; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
2751; AVX2-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
2752; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
2753; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
2754; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm4
2755; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
2756; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
2757; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
2758; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
2759; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
2760; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2761; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2762; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
2763; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2764; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
2765; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2766; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
2767; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2768; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
2769; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2770; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
2771; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2772; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
2773; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2774; AVX2-NEXT:    vmovaps %ymm4, 32(%r8)
2775; AVX2-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
2776; AVX2-NEXT:    vmovaps %ymm4, (%r8)
2777; AVX2-NEXT:    vmovdqa %ymm11, 32(%r9)
2778; AVX2-NEXT:    vmovdqa %ymm12, (%r9)
2779; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2780; AVX2-NEXT:    vmovdqa %ymm2, 32(%rax)
2781; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
2782; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2783; AVX2-NEXT:    vmovdqa %ymm0, 32(%rax)
2784; AVX2-NEXT:    vmovdqa %ymm3, (%rax)
2785; AVX2-NEXT:    addq $264, %rsp # imm = 0x108
2786; AVX2-NEXT:    vzeroupper
2787; AVX2-NEXT:    retq
2788;
2789; AVX2-FP-LABEL: load_i32_stride7_vf16:
2790; AVX2-FP:       # %bb.0:
2791; AVX2-FP-NEXT:    subq $264, %rsp # imm = 0x108
2792; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm5
2793; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm9
2794; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm7
2795; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm4
2796; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm0
2797; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm3
2798; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm10
2799; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
2800; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm15
2801; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
2802; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
2803; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0]
2804; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7]
2805; AVX2-FP-NEXT:    vpermd %ymm8, %ymm2, %ymm8
2806; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
2807; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm8
2808; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm11
2809; AVX2-FP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2810; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
2811; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2812; AVX2-FP-NEXT:    vpbroadcastd 196(%rdi), %ymm11
2813; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7]
2814; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
2815; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2816; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2817; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2818; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7]
2819; AVX2-FP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
2820; AVX2-FP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
2821; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
2822; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
2823; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm2
2824; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %xmm8
2825; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2826; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
2827; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
2828; AVX2-FP-NEXT:    vpbroadcastd 420(%rdi), %ymm8
2829; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
2830; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2831; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2832; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
2833; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2834; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
2835; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
2836; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2837; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
2838; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
2839; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2840; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
2841; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6]
2842; AVX2-FP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
2843; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7]
2844; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2845; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
2846; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
2847; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm3
2848; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
2849; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
2850; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0]
2851; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7]
2852; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm11
2853; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
2854; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7]
2855; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7]
2856; AVX2-FP-NEXT:    vpermd %ymm0, %ymm12, %ymm0
2857; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2858; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2859; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
2860; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
2861; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
2862; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
2863; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm12
2864; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3]
2865; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2866; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2867; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm14
2868; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
2869; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2870; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2871; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm0
2872; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
2873; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
2874; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
2875; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm14
2876; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
2877; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2878; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
2879; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm13
2880; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7]
2881; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2882; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2883; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7]
2884; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
2885; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
2886; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
2887; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2888; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7]
2889; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm11
2890; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7]
2891; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2892; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
2893; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
2894; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3]
2895; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
2896; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
2897; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
2898; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
2899; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm4
2900; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
2901; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2902; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2903; AVX2-FP-NEXT:    vpbroadcastd 100(%rdi), %xmm0
2904; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm1
2905; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2906; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0]
2907; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7]
2908; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm15
2909; AVX2-FP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
2910; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
2911; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
2912; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm11
2913; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7]
2914; AVX2-FP-NEXT:    vpbroadcastd 212(%rdi), %ymm12
2915; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
2916; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7]
2917; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
2918; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2919; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7]
2920; AVX2-FP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
2921; AVX2-FP-NEXT:    vpbroadcastd 324(%rdi), %xmm5
2922; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm13
2923; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3]
2924; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
2925; AVX2-FP-NEXT:    vpermd %ymm7, %ymm0, %ymm5
2926; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
2927; AVX2-FP-NEXT:    vpbroadcastd 436(%rdi), %ymm11
2928; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7]
2929; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7]
2930; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2931; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
2932; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
2933; AVX2-FP-NEXT:    vpbroadcastd 216(%rdi), %ymm3
2934; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
2935; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm3
2936; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
2937; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2938; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7]
2939; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
2940; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
2941; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2942; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %xmm8
2943; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3]
2944; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
2945; AVX2-FP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
2946; AVX2-FP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
2947; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
2948; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
2949; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
2950; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
2951; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
2952; AVX2-FP-NEXT:    vpbroadcastd 440(%rdi), %ymm5
2953; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
2954; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
2955; AVX2-FP-NEXT:    vpbroadcastd 136(%rdi), %xmm4
2956; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2957; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
2958; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
2959; AVX2-FP-NEXT:    vpermd 192(%rdi), %ymm0, %ymm5
2960; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
2961; AVX2-FP-NEXT:    vpbroadcastd 80(%rdi), %ymm5
2962; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
2963; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7]
2964; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
2965; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
2966; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2967; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
2968; AVX2-FP-NEXT:    vpbroadcastd 360(%rdi), %xmm4
2969; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2970; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
2971; AVX2-FP-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
2972; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
2973; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
2974; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm4
2975; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
2976; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
2977; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
2978; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
2979; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
2980; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2981; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2982; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
2983; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2984; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
2985; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2986; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
2987; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2988; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
2989; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2990; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
2991; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2992; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
2993; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2994; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%r8)
2995; AVX2-FP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
2996; AVX2-FP-NEXT:    vmovaps %ymm4, (%r8)
2997; AVX2-FP-NEXT:    vmovdqa %ymm11, 32(%r9)
2998; AVX2-FP-NEXT:    vmovdqa %ymm12, (%r9)
2999; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3000; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rax)
3001; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
3002; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3003; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rax)
3004; AVX2-FP-NEXT:    vmovdqa %ymm3, (%rax)
3005; AVX2-FP-NEXT:    addq $264, %rsp # imm = 0x108
3006; AVX2-FP-NEXT:    vzeroupper
3007; AVX2-FP-NEXT:    retq
3008;
3009; AVX2-FCP-LABEL: load_i32_stride7_vf16:
3010; AVX2-FCP:       # %bb.0:
3011; AVX2-FCP-NEXT:    subq $264, %rsp # imm = 0x108
3012; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm5
3013; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm9
3014; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm7
3015; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm4
3016; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
3017; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
3018; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm10
3019; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
3020; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm15
3021; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
3022; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
3023; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0]
3024; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7]
3025; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm2, %ymm8
3026; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
3027; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm8
3028; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm11
3029; AVX2-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3030; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
3031; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3032; AVX2-FCP-NEXT:    vpbroadcastd 196(%rdi), %ymm11
3033; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7]
3034; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
3035; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3036; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3037; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3038; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7]
3039; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3040; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
3041; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
3042; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
3043; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
3044; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm8
3045; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3046; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
3047; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3048; AVX2-FCP-NEXT:    vpbroadcastd 420(%rdi), %ymm8
3049; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
3050; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
3051; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3052; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
3053; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3054; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
3055; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
3056; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
3057; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
3058; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
3059; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3060; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
3061; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6]
3062; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
3063; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7]
3064; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3065; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
3066; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
3067; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
3068; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
3069; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
3070; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0]
3071; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7]
3072; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
3073; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
3074; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7]
3075; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7]
3076; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm12, %ymm0
3077; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3078; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3079; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
3080; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
3081; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
3082; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
3083; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
3084; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3]
3085; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
3086; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3087; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm14
3088; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3089; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3090; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3091; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm0
3092; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
3093; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
3094; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
3095; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm14
3096; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
3097; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
3098; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
3099; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm13
3100; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7]
3101; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3102; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3103; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7]
3104; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
3105; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
3106; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
3107; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
3108; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7]
3109; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm11
3110; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7]
3111; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3112; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
3113; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
3114; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3]
3115; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
3116; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
3117; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
3118; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
3119; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm4
3120; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
3121; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3122; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3123; AVX2-FCP-NEXT:    vpbroadcastd 100(%rdi), %xmm0
3124; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
3125; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3126; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0]
3127; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7]
3128; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm15
3129; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
3130; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
3131; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
3132; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm11
3133; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7]
3134; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm12
3135; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
3136; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7]
3137; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
3138; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
3139; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7]
3140; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
3141; AVX2-FCP-NEXT:    vpbroadcastd 324(%rdi), %xmm5
3142; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %xmm13
3143; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3]
3144; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
3145; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm0, %ymm5
3146; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
3147; AVX2-FCP-NEXT:    vpbroadcastd 436(%rdi), %ymm11
3148; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7]
3149; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7]
3150; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
3151; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,0,3,3,1,0,7,7]
3152; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
3153; AVX2-FCP-NEXT:    vpbroadcastd 216(%rdi), %ymm4
3154; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
3155; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
3156; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
3157; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
3158; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,0,2,3,5,4,6,7]
3159; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
3160; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
3161; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3162; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %xmm8
3163; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3]
3164; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
3165; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
3166; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
3167; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
3168; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
3169; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
3170; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm3, %ymm3
3171; AVX2-FCP-NEXT:    vpbroadcastd 440(%rdi), %ymm5
3172; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
3173; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3174; AVX2-FCP-NEXT:    vpbroadcastd 136(%rdi), %xmm3
3175; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
3176; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
3177; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3178; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm0, %ymm5
3179; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
3180; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm5
3181; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
3182; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7]
3183; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
3184; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
3185; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3186; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3187; AVX2-FCP-NEXT:    vpbroadcastd 360(%rdi), %xmm4
3188; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
3189; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
3190; AVX2-FCP-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
3191; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3192; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
3193; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm4
3194; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
3195; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
3196; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
3197; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
3198; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3199; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
3200; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3201; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
3202; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3203; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
3204; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3205; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
3206; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3207; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
3208; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3209; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
3210; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3211; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
3212; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3213; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r8)
3214; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
3215; AVX2-FCP-NEXT:    vmovaps %ymm4, (%r8)
3216; AVX2-FCP-NEXT:    vmovdqa %ymm11, 32(%r9)
3217; AVX2-FCP-NEXT:    vmovdqa %ymm12, (%r9)
3218; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3219; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rax)
3220; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
3221; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3222; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%rax)
3223; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rax)
3224; AVX2-FCP-NEXT:    addq $264, %rsp # imm = 0x108
3225; AVX2-FCP-NEXT:    vzeroupper
3226; AVX2-FCP-NEXT:    retq
3227;
3228; AVX512-LABEL: load_i32_stride7_vf16:
3229; AVX512:       # %bb.0:
3230; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3231; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3232; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm1
3233; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm5
3234; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm4
3235; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
3236; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm2
3237; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm8
3238; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm7
3239; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3240; AVX512-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3241; AVX512-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3242; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3243; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3244; AVX512-NEXT:    movw $992, %di # imm = 0x3E0
3245; AVX512-NEXT:    kmovw %edi, %k1
3246; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3247; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3248; AVX512-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3249; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3250; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3251; AVX512-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3252; AVX512-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3253; AVX512-NEXT:    movb $-32, %dil
3254; AVX512-NEXT:    kmovw %edi, %k1
3255; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3256; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3257; AVX512-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3258; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm9
3259; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3260; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3261; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3262; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3263; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3264; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3265; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3266; AVX512-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3267; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3268; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3269; AVX512-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3270; AVX512-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3271; AVX512-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3272; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3273; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3274; AVX512-NEXT:    movw $480, %di # imm = 0x1E0
3275; AVX512-NEXT:    kmovw %edi, %k2
3276; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3277; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3278; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3279; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3280; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3281; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3282; AVX512-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3283; AVX512-NEXT:    movw $-512, %di # imm = 0xFE00
3284; AVX512-NEXT:    kmovw %edi, %k1
3285; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3286; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3287; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3288; AVX512-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3289; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3290; AVX512-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3291; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3292; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3293; AVX512-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3294; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3295; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3296; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3297; AVX512-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
3298; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
3299; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
3300; AVX512-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
3301; AVX512-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
3302; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
3303; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3304; AVX512-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3305; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
3306; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3307; AVX512-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3308; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
3309; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
3310; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3311; AVX512-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3312; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
3313; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3314; AVX512-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3315; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
3316; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3317; AVX512-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
3318; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
3319; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3320; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
3321; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3322; AVX512-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
3323; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
3324; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
3325; AVX512-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
3326; AVX512-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
3327; AVX512-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
3328; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
3329; AVX512-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
3330; AVX512-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
3331; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
3332; AVX512-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
3333; AVX512-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
3334; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
3335; AVX512-NEXT:    vmovdqa64 %zmm3, (%rsi)
3336; AVX512-NEXT:    vmovdqa64 %zmm14, (%rdx)
3337; AVX512-NEXT:    vmovdqa64 %zmm7, (%rcx)
3338; AVX512-NEXT:    vmovdqa64 %zmm9, (%r8)
3339; AVX512-NEXT:    vmovdqa64 %zmm10, (%r9)
3340; AVX512-NEXT:    vmovdqa64 %zmm6, (%r10)
3341; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
3342; AVX512-NEXT:    vzeroupper
3343; AVX512-NEXT:    retq
3344;
3345; AVX512-FCP-LABEL: load_i32_stride7_vf16:
3346; AVX512-FCP:       # %bb.0:
3347; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3348; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3349; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
3350; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
3351; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
3352; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
3353; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
3354; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
3355; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm7
3356; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3357; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3358; AVX512-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3359; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3360; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3361; AVX512-FCP-NEXT:    movw $992, %di # imm = 0x3E0
3362; AVX512-FCP-NEXT:    kmovw %edi, %k1
3363; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3364; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3365; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3366; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3367; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3368; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3369; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3370; AVX512-FCP-NEXT:    movb $-32, %dil
3371; AVX512-FCP-NEXT:    kmovw %edi, %k1
3372; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3373; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3374; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3375; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm9
3376; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3377; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3378; AVX512-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3379; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3380; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3381; AVX512-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3382; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3383; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3384; AVX512-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3385; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3386; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3387; AVX512-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3388; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3389; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3390; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3391; AVX512-FCP-NEXT:    movw $480, %di # imm = 0x1E0
3392; AVX512-FCP-NEXT:    kmovw %edi, %k2
3393; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3394; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3395; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3396; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3397; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3398; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3399; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3400; AVX512-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
3401; AVX512-FCP-NEXT:    kmovw %edi, %k1
3402; AVX512-FCP-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3403; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3404; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3405; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3406; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3407; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3408; AVX512-FCP-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3409; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3410; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3411; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3412; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3413; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3414; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
3415; AVX512-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
3416; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
3417; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
3418; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
3419; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
3420; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3421; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3422; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
3423; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3424; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3425; AVX512-FCP-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
3426; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
3427; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3428; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3429; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
3430; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3431; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3432; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
3433; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3434; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
3435; AVX512-FCP-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
3436; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3437; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
3438; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3439; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
3440; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
3441; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
3442; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
3443; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
3444; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
3445; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
3446; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
3447; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
3448; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
3449; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
3450; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
3451; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
3452; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
3453; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
3454; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
3455; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
3456; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
3457; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%r10)
3458; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
3459; AVX512-FCP-NEXT:    vzeroupper
3460; AVX512-FCP-NEXT:    retq
3461;
3462; AVX512DQ-LABEL: load_i32_stride7_vf16:
3463; AVX512DQ:       # %bb.0:
3464; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3465; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3466; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm1
3467; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm5
3468; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm4
3469; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
3470; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm2
3471; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm8
3472; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm7
3473; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3474; AVX512DQ-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3475; AVX512DQ-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3476; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3477; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3478; AVX512DQ-NEXT:    movw $992, %di # imm = 0x3E0
3479; AVX512DQ-NEXT:    kmovw %edi, %k1
3480; AVX512DQ-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3481; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3482; AVX512DQ-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3483; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3484; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3485; AVX512DQ-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3486; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3487; AVX512DQ-NEXT:    movb $-32, %dil
3488; AVX512DQ-NEXT:    kmovw %edi, %k1
3489; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3490; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3491; AVX512DQ-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3492; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm9
3493; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3494; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3495; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3496; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3497; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3498; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3499; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3500; AVX512DQ-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3501; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3502; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3503; AVX512DQ-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3504; AVX512DQ-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3505; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3506; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3507; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3508; AVX512DQ-NEXT:    movw $480, %di # imm = 0x1E0
3509; AVX512DQ-NEXT:    kmovw %edi, %k2
3510; AVX512DQ-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3511; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3512; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3513; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3514; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3515; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3516; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3517; AVX512DQ-NEXT:    movw $-512, %di # imm = 0xFE00
3518; AVX512DQ-NEXT:    kmovw %edi, %k1
3519; AVX512DQ-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3520; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3521; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3522; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3523; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3524; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3525; AVX512DQ-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3526; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3527; AVX512DQ-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3528; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3529; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3530; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3531; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
3532; AVX512DQ-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
3533; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
3534; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
3535; AVX512DQ-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
3536; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
3537; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3538; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3539; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
3540; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3541; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3542; AVX512DQ-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
3543; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
3544; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3545; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3546; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
3547; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3548; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3549; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
3550; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3551; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
3552; AVX512DQ-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
3553; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3554; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
3555; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3556; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
3557; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
3558; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
3559; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
3560; AVX512DQ-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
3561; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
3562; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
3563; AVX512DQ-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
3564; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
3565; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
3566; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
3567; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
3568; AVX512DQ-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
3569; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rsi)
3570; AVX512DQ-NEXT:    vmovdqa64 %zmm14, (%rdx)
3571; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rcx)
3572; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%r8)
3573; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%r9)
3574; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%r10)
3575; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
3576; AVX512DQ-NEXT:    vzeroupper
3577; AVX512DQ-NEXT:    retq
3578;
3579; AVX512DQ-FCP-LABEL: load_i32_stride7_vf16:
3580; AVX512DQ-FCP:       # %bb.0:
3581; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3582; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3583; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
3584; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
3585; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
3586; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
3587; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
3588; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
3589; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm7
3590; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3591; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3592; AVX512DQ-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3593; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3594; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3595; AVX512DQ-FCP-NEXT:    movw $992, %di # imm = 0x3E0
3596; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
3597; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3598; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3599; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3600; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3601; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3602; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3603; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3604; AVX512DQ-FCP-NEXT:    movb $-32, %dil
3605; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
3606; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3607; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3608; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3609; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm9
3610; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3611; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3612; AVX512DQ-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3613; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3614; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3615; AVX512DQ-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3616; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3617; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3618; AVX512DQ-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3619; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3620; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3621; AVX512DQ-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3622; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3623; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3624; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3625; AVX512DQ-FCP-NEXT:    movw $480, %di # imm = 0x1E0
3626; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
3627; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3628; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3629; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3630; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3631; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3632; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3633; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3634; AVX512DQ-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
3635; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
3636; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3637; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3638; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3639; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3640; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3641; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3642; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3643; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3644; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3645; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3646; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3647; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3648; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
3649; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
3650; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
3651; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
3652; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
3653; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
3654; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3655; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3656; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
3657; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3658; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3659; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
3660; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
3661; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3662; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3663; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
3664; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3665; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3666; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
3667; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3668; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
3669; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
3670; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3671; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
3672; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3673; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
3674; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
3675; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
3676; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
3677; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
3678; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
3679; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
3680; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
3681; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
3682; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
3683; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
3684; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
3685; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
3686; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
3687; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
3688; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
3689; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
3690; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
3691; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%r10)
3692; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
3693; AVX512DQ-FCP-NEXT:    vzeroupper
3694; AVX512DQ-FCP-NEXT:    retq
3695;
3696; AVX512BW-LABEL: load_i32_stride7_vf16:
3697; AVX512BW:       # %bb.0:
3698; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3699; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3700; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
3701; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm5
3702; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm4
3703; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3704; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
3705; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
3706; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
3707; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3708; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3709; AVX512BW-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3710; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3711; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3712; AVX512BW-NEXT:    movw $992, %di # imm = 0x3E0
3713; AVX512BW-NEXT:    kmovd %edi, %k1
3714; AVX512BW-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3715; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3716; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3717; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3718; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3719; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3720; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3721; AVX512BW-NEXT:    movb $-32, %dil
3722; AVX512BW-NEXT:    kmovd %edi, %k1
3723; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3724; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3725; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3726; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
3727; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3728; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3729; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3730; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3731; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3732; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3733; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3734; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3735; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3736; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3737; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3738; AVX512BW-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3739; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3740; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3741; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3742; AVX512BW-NEXT:    movw $480, %di # imm = 0x1E0
3743; AVX512BW-NEXT:    kmovd %edi, %k2
3744; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3745; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3746; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3747; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3748; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3749; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3750; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3751; AVX512BW-NEXT:    movw $-512, %di # imm = 0xFE00
3752; AVX512BW-NEXT:    kmovd %edi, %k1
3753; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3754; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3755; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3756; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3757; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3758; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3759; AVX512BW-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3760; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3761; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3762; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3763; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3764; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3765; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
3766; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
3767; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
3768; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
3769; AVX512BW-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
3770; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
3771; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3772; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3773; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
3774; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3775; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3776; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
3777; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
3778; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3779; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3780; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
3781; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3782; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3783; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
3784; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3785; AVX512BW-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
3786; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
3787; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3788; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
3789; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3790; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
3791; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
3792; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
3793; AVX512BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
3794; AVX512BW-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
3795; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
3796; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
3797; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
3798; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
3799; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
3800; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
3801; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
3802; AVX512BW-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
3803; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rsi)
3804; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
3805; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rcx)
3806; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%r8)
3807; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%r9)
3808; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%r10)
3809; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
3810; AVX512BW-NEXT:    vzeroupper
3811; AVX512BW-NEXT:    retq
3812;
3813; AVX512BW-FCP-LABEL: load_i32_stride7_vf16:
3814; AVX512BW-FCP:       # %bb.0:
3815; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3816; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3817; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
3818; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
3819; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
3820; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
3821; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
3822; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
3823; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm7
3824; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3825; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3826; AVX512BW-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3827; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3828; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3829; AVX512BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
3830; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
3831; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3832; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3833; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3834; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3835; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3836; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3837; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3838; AVX512BW-FCP-NEXT:    movb $-32, %dil
3839; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
3840; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3841; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3842; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3843; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm9
3844; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3845; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3846; AVX512BW-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3847; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3848; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3849; AVX512BW-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3850; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3851; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3852; AVX512BW-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3853; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3854; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3855; AVX512BW-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3856; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3857; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3858; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3859; AVX512BW-FCP-NEXT:    movw $480, %di # imm = 0x1E0
3860; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
3861; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3862; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3863; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3864; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3865; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3866; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3867; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3868; AVX512BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
3869; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
3870; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3871; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3872; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3873; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3874; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3875; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3876; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3877; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3878; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3879; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3880; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3881; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3882; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
3883; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
3884; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
3885; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
3886; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
3887; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
3888; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3889; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3890; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
3891; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3892; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3893; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
3894; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
3895; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
3896; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
3897; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
3898; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3899; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
3900; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
3901; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
3902; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
3903; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
3904; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3905; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
3906; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3907; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
3908; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
3909; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
3910; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
3911; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
3912; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
3913; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
3914; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
3915; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
3916; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
3917; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
3918; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
3919; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
3920; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
3921; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
3922; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
3923; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
3924; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
3925; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%r10)
3926; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
3927; AVX512BW-FCP-NEXT:    vzeroupper
3928; AVX512BW-FCP-NEXT:    retq
3929;
3930; AVX512DQ-BW-LABEL: load_i32_stride7_vf16:
3931; AVX512DQ-BW:       # %bb.0:
3932; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3933; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3934; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
3935; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm5
3936; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm4
3937; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3938; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
3939; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
3940; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
3941; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
3942; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
3943; AVX512DQ-BW-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
3944; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
3945; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
3946; AVX512DQ-BW-NEXT:    movw $992, %di # imm = 0x3E0
3947; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
3948; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
3949; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
3950; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3951; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
3952; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
3953; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3954; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
3955; AVX512DQ-BW-NEXT:    movb $-32, %dil
3956; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
3957; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
3958; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
3959; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3960; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm9
3961; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
3962; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3963; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
3964; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
3965; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
3966; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
3967; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
3968; AVX512DQ-BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
3969; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
3970; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
3971; AVX512DQ-BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
3972; AVX512DQ-BW-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
3973; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
3974; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
3975; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
3976; AVX512DQ-BW-NEXT:    movw $480, %di # imm = 0x1E0
3977; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
3978; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
3979; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
3980; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
3981; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
3982; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
3983; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3984; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
3985; AVX512DQ-BW-NEXT:    movw $-512, %di # imm = 0xFE00
3986; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
3987; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
3988; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
3989; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3990; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
3991; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
3992; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
3993; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
3994; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
3995; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
3996; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
3997; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
3998; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
3999; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
4000; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
4001; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
4002; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
4003; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
4004; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
4005; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
4006; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
4007; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
4008; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
4009; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
4010; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
4011; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
4012; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
4013; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
4014; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
4015; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
4016; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
4017; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
4018; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
4019; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
4020; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
4021; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
4022; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
4023; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
4024; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
4025; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
4026; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
4027; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
4028; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
4029; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
4030; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
4031; AVX512DQ-BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
4032; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
4033; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
4034; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
4035; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
4036; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
4037; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rsi)
4038; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
4039; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rcx)
4040; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%r8)
4041; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%r9)
4042; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, (%r10)
4043; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
4044; AVX512DQ-BW-NEXT:    vzeroupper
4045; AVX512DQ-BW-NEXT:    retq
4046;
4047; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf16:
4048; AVX512DQ-BW-FCP:       # %bb.0:
4049; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4050; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4051; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
4052; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
4053; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm4
4054; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
4055; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
4056; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
4057; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm7
4058; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
4059; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4060; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
4061; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
4062; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm3
4063; AVX512DQ-BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
4064; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
4065; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
4066; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
4067; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4068; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
4069; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
4070; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
4071; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm9
4072; AVX512DQ-BW-FCP-NEXT:    movb $-32, %dil
4073; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
4074; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
4075; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
4076; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4077; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm9
4078; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
4079; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4080; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm10
4081; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
4082; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
4083; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm11
4084; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
4085; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
4086; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm7, %zmm8, %zmm12
4087; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
4088; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
4089; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm13
4090; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm6, %zmm8
4091; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
4092; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
4093; AVX512DQ-BW-FCP-NEXT:    movw $480, %di # imm = 0x1E0
4094; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
4095; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm14 {%k2}
4096; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
4097; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4098; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm8
4099; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
4100; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
4101; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm15
4102; AVX512DQ-BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
4103; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
4104; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
4105; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
4106; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4107; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
4108; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
4109; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm7
4110; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm7 {%k2}
4111; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
4112; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
4113; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
4114; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
4115; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
4116; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm15
4117; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm7 {%k1}
4118; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
4119; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm9
4120; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm10, %zmm9 {%k2}
4121; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
4122; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
4123; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
4124; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
4125; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
4126; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
4127; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
4128; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
4129; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
4130; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm10
4131; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
4132; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
4133; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm15
4134; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
4135; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
4136; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm11, %zmm10
4137; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
4138; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
4139; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
4140; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
4141; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm11
4142; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
4143; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm6
4144; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm12, %zmm6
4145; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm6 {%k1}
4146; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm4
4147; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
4148; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
4149; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm4, %zmm5
4150; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
4151; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
4152; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm13, %zmm0
4153; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
4154; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
4155; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
4156; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
4157; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
4158; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
4159; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%r10)
4160; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
4161; AVX512DQ-BW-FCP-NEXT:    vzeroupper
4162; AVX512DQ-BW-FCP-NEXT:    retq
4163  %wide.vec = load <112 x i32>, ptr %in.vec, align 64
4164  %strided.vec0 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
4165  %strided.vec1 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
4166  %strided.vec2 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
4167  %strided.vec3 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
4168  %strided.vec4 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
4169  %strided.vec5 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
4170  %strided.vec6 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
4171  store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
4172  store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
4173  store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
4174  store <16 x i32> %strided.vec3, ptr %out.vec3, align 64
4175  store <16 x i32> %strided.vec4, ptr %out.vec4, align 64
4176  store <16 x i32> %strided.vec5, ptr %out.vec5, align 64
4177  store <16 x i32> %strided.vec6, ptr %out.vec6, align 64
4178  ret void
4179}
4180
4181define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
4182; SSE-LABEL: load_i32_stride7_vf32:
4183; SSE:       # %bb.0:
4184; SSE-NEXT:    subq $1160, %rsp # imm = 0x488
4185; SSE-NEXT:    movdqa 80(%rdi), %xmm8
4186; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4187; SSE-NEXT:    movdqa (%rdi), %xmm13
4188; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4189; SSE-NEXT:    movdqa 16(%rdi), %xmm6
4190; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4191; SSE-NEXT:    movdqa 48(%rdi), %xmm5
4192; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4193; SSE-NEXT:    movdqa 640(%rdi), %xmm3
4194; SSE-NEXT:    movdqa 608(%rdi), %xmm4
4195; SSE-NEXT:    movdqa 560(%rdi), %xmm10
4196; SSE-NEXT:    movdqa 576(%rdi), %xmm1
4197; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4198; SSE-NEXT:    movdqa 192(%rdi), %xmm14
4199; SSE-NEXT:    movdqa 160(%rdi), %xmm12
4200; SSE-NEXT:    movdqa 112(%rdi), %xmm2
4201; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4202; SSE-NEXT:    movdqa 128(%rdi), %xmm0
4203; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4204; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
4205; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4206; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
4207; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4208; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
4209; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4210; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4211; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
4212; SSE-NEXT:    movdqa %xmm10, %xmm2
4213; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4214; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4215; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
4216; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4217; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
4218; SSE-NEXT:    movdqa %xmm3, %xmm7
4219; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4220; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4221; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4222; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
4223; SSE-NEXT:    movdqa %xmm13, %xmm2
4224; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4225; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
4226; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
4227; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4228; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4229; SSE-NEXT:    movdqa 448(%rdi), %xmm2
4230; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4231; SSE-NEXT:    movdqa 464(%rdi), %xmm0
4232; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4233; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
4234; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4235; SSE-NEXT:    movdqa 528(%rdi), %xmm9
4236; SSE-NEXT:    movdqa 496(%rdi), %xmm13
4237; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
4238; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4239; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
4240; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4241; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4242; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4243; SSE-NEXT:    movdqa 336(%rdi), %xmm2
4244; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4245; SSE-NEXT:    movdqa 352(%rdi), %xmm0
4246; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4247; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
4248; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4249; SSE-NEXT:    movdqa 416(%rdi), %xmm3
4250; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4251; SSE-NEXT:    movdqa 384(%rdi), %xmm11
4252; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
4253; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4254; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
4255; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4256; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4257; SSE-NEXT:    movdqa 784(%rdi), %xmm2
4258; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4259; SSE-NEXT:    movdqa 800(%rdi), %xmm0
4260; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4261; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
4262; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4263; SSE-NEXT:    movdqa 864(%rdi), %xmm8
4264; SSE-NEXT:    movdqa 832(%rdi), %xmm15
4265; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
4266; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4267; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
4268; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4269; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4270; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4271; SSE-NEXT:    movdqa 224(%rdi), %xmm3
4272; SSE-NEXT:    movdqa %xmm3, (%rsp) # 16-byte Spill
4273; SSE-NEXT:    movdqa 240(%rdi), %xmm0
4274; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4275; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
4276; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
4277; SSE-NEXT:    movdqa 304(%rdi), %xmm1
4278; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4279; SSE-NEXT:    movdqa 272(%rdi), %xmm6
4280; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
4281; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4282; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4283; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
4284; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4285; SSE-NEXT:    movdqa 672(%rdi), %xmm3
4286; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4287; SSE-NEXT:    movdqa 688(%rdi), %xmm0
4288; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4289; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
4290; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
4291; SSE-NEXT:    movdqa 752(%rdi), %xmm1
4292; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4293; SSE-NEXT:    movdqa 720(%rdi), %xmm0
4294; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4295; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
4296; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4297; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
4298; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4299; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4300; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
4301; SSE-NEXT:    movdqa %xmm12, %xmm3
4302; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4303; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4304; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
4305; SSE-NEXT:    movdqa 144(%rdi), %xmm1
4306; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4307; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4308; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
4309; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4310; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
4311; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4312; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4313; SSE-NEXT:    movdqa 592(%rdi), %xmm1
4314; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4315; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4316; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
4317; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4318; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4319; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
4320; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4321; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4322; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4323; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
4324; SSE-NEXT:    movdqa 32(%rdi), %xmm4
4325; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4326; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4327; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4328; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4329; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
4330; SSE-NEXT:    punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
4331; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4332; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
4333; SSE-NEXT:    movdqa 480(%rdi), %xmm4
4334; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4335; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4336; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
4337; SSE-NEXT:    movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4338; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4339; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
4340; SSE-NEXT:    movdqa %xmm11, %xmm4
4341; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4342; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4343; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
4344; SSE-NEXT:    movdqa 368(%rdi), %xmm11
4345; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
4346; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4347; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
4348; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4349; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
4350; SSE-NEXT:    movdqa %xmm15, %xmm4
4351; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4352; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4353; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
4354; SSE-NEXT:    movdqa 816(%rdi), %xmm7
4355; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4356; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
4357; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
4358; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4359; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4360; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
4361; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
4362; SSE-NEXT:    movdqa (%rsp), %xmm9 # 16-byte Reload
4363; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[1,1,1,1]
4364; SSE-NEXT:    movdqa 256(%rdi), %xmm13
4365; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1]
4366; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1]
4367; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4368; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4369; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2]
4370; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4371; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
4372; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4373; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
4374; SSE-NEXT:    movdqa 704(%rdi), %xmm0
4375; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4376; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
4377; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1]
4378; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4379; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
4380; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4381; SSE-NEXT:    # xmm8 = mem[1,1,1,1]
4382; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4383; SSE-NEXT:    movdqa 176(%rdi), %xmm0
4384; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4385; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3]
4386; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1]
4387; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
4388; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1]
4389; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4390; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
4391; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4392; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1]
4393; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4394; SSE-NEXT:    movdqa 64(%rdi), %xmm3
4395; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3]
4396; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
4397; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
4398; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
4399; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4400; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
4401; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1]
4402; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4403; SSE-NEXT:    movdqa 400(%rdi), %xmm1
4404; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4405; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3]
4406; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4407; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
4408; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
4409; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4410; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3]
4411; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1]
4412; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4413; SSE-NEXT:    movdqa 288(%rdi), %xmm10
4414; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3]
4415; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1]
4416; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4417; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
4418; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
4419; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4420; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4421; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
4422; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4423; SSE-NEXT:    # xmm8 = mem[1,1,1,1]
4424; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4425; SSE-NEXT:    movdqa 624(%rdi), %xmm1
4426; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4427; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4428; SSE-NEXT:    # xmm8 = mem[2,3,2,3]
4429; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4430; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
4431; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
4432; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4433; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4434; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3]
4435; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4436; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1]
4437; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4438; SSE-NEXT:    movdqa 512(%rdi), %xmm1
4439; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4440; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4441; SSE-NEXT:    # xmm8 = mem[2,3,2,3]
4442; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4443; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
4444; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
4445; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4446; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3]
4447; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4448; SSE-NEXT:    # xmm8 = mem[1,1,1,1]
4449; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4450; SSE-NEXT:    movdqa 848(%rdi), %xmm1
4451; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4452; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4453; SSE-NEXT:    # xmm8 = mem[2,3,2,3]
4454; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4455; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
4456; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
4457; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4458; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4459; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
4460; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4461; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1]
4462; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4463; SSE-NEXT:    movdqa 736(%rdi), %xmm2
4464; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3]
4465; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
4466; SSE-NEXT:    movdqa %xmm2, %xmm12
4467; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4468; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
4469; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
4470; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4471; SSE-NEXT:    movdqa 96(%rdi), %xmm1
4472; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4473; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1]
4474; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4475; SSE-NEXT:    movdqa %xmm3, %xmm1
4476; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
4477; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4478; SSE-NEXT:    # xmm5 = mem[2,2,3,3]
4479; SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
4480; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
4481; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4482; SSE-NEXT:    movdqa 208(%rdi), %xmm0
4483; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4484; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1]
4485; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4486; SSE-NEXT:    movdqa %xmm4, %xmm2
4487; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
4488; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4489; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
4490; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4491; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
4492; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
4493; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4494; SSE-NEXT:    movdqa 320(%rdi), %xmm0
4495; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4496; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4497; SSE-NEXT:    movdqa %xmm10, %xmm2
4498; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4499; SSE-NEXT:    pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload
4500; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
4501; SSE-NEXT:    movdqa %xmm13, %xmm10
4502; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
4503; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
4504; SSE-NEXT:    movapd %xmm2, (%rsp) # 16-byte Spill
4505; SSE-NEXT:    movdqa 432(%rdi), %xmm0
4506; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4507; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4508; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4509; SSE-NEXT:    movdqa %xmm13, %xmm2
4510; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4511; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4512; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
4513; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4514; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
4515; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
4516; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4517; SSE-NEXT:    movdqa 544(%rdi), %xmm0
4518; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4519; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4520; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4521; SSE-NEXT:    movdqa %xmm5, %xmm2
4522; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4523; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3]
4524; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
4525; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
4526; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4527; SSE-NEXT:    movdqa 656(%rdi), %xmm0
4528; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4529; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4530; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4531; SSE-NEXT:    movdqa %xmm7, %xmm2
4532; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4533; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4534; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
4535; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4536; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
4537; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
4538; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4539; SSE-NEXT:    movdqa 768(%rdi), %xmm0
4540; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4541; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4542; SSE-NEXT:    movdqa %xmm12, %xmm2
4543; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4544; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3]
4545; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3]
4546; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
4547; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4548; SSE-NEXT:    movdqa 880(%rdi), %xmm0
4549; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4550; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4551; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4552; SSE-NEXT:    movdqa %xmm11, %xmm0
4553; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4554; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4555; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
4556; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4557; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
4558; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4559; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4560; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4561; SSE-NEXT:    # xmm1 = mem[3,3,3,3]
4562; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4563; SSE-NEXT:    movdqa %xmm6, %xmm2
4564; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4565; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
4566; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4567; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
4568; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4569; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4570; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3]
4571; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4572; SSE-NEXT:    movdqa %xmm8, %xmm2
4573; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4574; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
4575; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4576; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4577; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4578; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4579; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
4580; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4581; SSE-NEXT:    movdqa %xmm9, %xmm1
4582; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4583; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4584; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
4585; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4586; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4587; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4588; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4589; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3]
4590; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4591; SSE-NEXT:    movdqa %xmm10, %xmm1
4592; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4593; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
4594; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4595; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
4596; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4597; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4598; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4599; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
4600; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4601; SSE-NEXT:    movdqa %xmm13, %xmm1
4602; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4603; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
4604; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4605; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
4606; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4607; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4608; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3]
4609; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4610; SSE-NEXT:    movdqa %xmm14, %xmm1
4611; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4612; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3]
4613; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4614; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
4615; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4616; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4617; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4618; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
4619; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4620; SSE-NEXT:    movdqa %xmm15, %xmm1
4621; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4622; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4623; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
4624; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4625; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
4626; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4627; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4628; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
4629; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4630; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4631; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
4632; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4633; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
4634; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4635; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4636; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
4637; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4638; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4639; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
4640; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4641; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
4642; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4643; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4644; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
4645; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4646; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4647; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4648; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4649; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
4650; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4651; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4652; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
4653; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4654; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4655; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
4656; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4657; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4658; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4659; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4660; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4661; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
4662; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4663; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4664; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4665; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4666; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
4667; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4668; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4669; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
4670; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4671; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4672; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
4673; SSE-NEXT:    movdqa %xmm13, %xmm5
4674; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4675; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4676; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4677; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4678; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4679; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
4680; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4681; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4682; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
4683; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4684; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
4685; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4686; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4687; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4688; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
4689; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4690; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4691; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
4692; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4693; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
4694; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4695; SSE-NEXT:    movapd %xmm1, %xmm15
4696; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
4697; SSE-NEXT:    movdqa %xmm11, %xmm1
4698; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4699; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4700; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
4701; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4702; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
4703; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4704; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4705; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4706; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
4707; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4708; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
4709; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4710; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4711; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
4712; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4713; SSE-NEXT:    # xmm9 = mem[0,0,1,1]
4714; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
4715; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
4716; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
4717; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3]
4718; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4719; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4720; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
4721; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4722; SSE-NEXT:    # xmm8 = mem[0,0,1,1]
4723; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
4724; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
4725; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
4726; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4727; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
4728; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4729; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4730; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
4731; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4732; SSE-NEXT:    # xmm7 = mem[0,0,1,1]
4733; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
4734; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
4735; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4736; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4737; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
4738; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4739; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4740; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
4741; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4742; SSE-NEXT:    # xmm6 = mem[0,0,1,1]
4743; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
4744; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
4745; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
4746; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
4747; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4748; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4749; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
4750; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4751; SSE-NEXT:    # xmm5 = mem[0,0,1,1]
4752; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
4753; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
4754; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
4755; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4756; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
4757; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4758; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
4759; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4760; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
4761; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
4762; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
4763; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
4764; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4765; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
4766; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4767; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3]
4768; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4769; SSE-NEXT:    # xmm3 = mem[0,0,1,1]
4770; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4771; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
4772; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
4773; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4774; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
4775; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4776; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4777; SSE-NEXT:    # xmm2 = mem[2,3,2,3]
4778; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4779; SSE-NEXT:    # xmm0 = mem[0,0,1,1]
4780; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4781; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4782; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4783; SSE-NEXT:    movaps %xmm1, 96(%rsi)
4784; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4785; SSE-NEXT:    movaps %xmm1, 32(%rsi)
4786; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4787; SSE-NEXT:    movaps %xmm1, 112(%rsi)
4788; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4789; SSE-NEXT:    movaps %xmm1, 48(%rsi)
4790; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4791; SSE-NEXT:    movaps %xmm1, 64(%rsi)
4792; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4793; SSE-NEXT:    movaps %xmm1, (%rsi)
4794; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4795; SSE-NEXT:    movaps %xmm1, 80(%rsi)
4796; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4797; SSE-NEXT:    movaps %xmm1, 16(%rsi)
4798; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4799; SSE-NEXT:    movaps %xmm1, 96(%rdx)
4800; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4801; SSE-NEXT:    movaps %xmm1, 32(%rdx)
4802; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4803; SSE-NEXT:    movaps %xmm1, 112(%rdx)
4804; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4805; SSE-NEXT:    movaps %xmm1, 48(%rdx)
4806; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4807; SSE-NEXT:    movaps %xmm1, 64(%rdx)
4808; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4809; SSE-NEXT:    movaps %xmm1, (%rdx)
4810; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4811; SSE-NEXT:    movaps %xmm1, 80(%rdx)
4812; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4813; SSE-NEXT:    movaps %xmm1, 16(%rdx)
4814; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4815; SSE-NEXT:    movaps %xmm1, 96(%rcx)
4816; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4817; SSE-NEXT:    movaps %xmm1, 112(%rcx)
4818; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4819; SSE-NEXT:    movaps %xmm1, 64(%rcx)
4820; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4821; SSE-NEXT:    movaps %xmm1, 80(%rcx)
4822; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4823; SSE-NEXT:    movaps %xmm1, 32(%rcx)
4824; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4825; SSE-NEXT:    movaps %xmm1, 48(%rcx)
4826; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4827; SSE-NEXT:    movaps %xmm1, (%rcx)
4828; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4829; SSE-NEXT:    movaps %xmm1, 16(%rcx)
4830; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4831; SSE-NEXT:    movaps %xmm1, 112(%r8)
4832; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4833; SSE-NEXT:    movaps %xmm1, 96(%r8)
4834; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4835; SSE-NEXT:    movaps %xmm1, 80(%r8)
4836; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4837; SSE-NEXT:    movaps %xmm1, 64(%r8)
4838; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4839; SSE-NEXT:    movaps %xmm1, 48(%r8)
4840; SSE-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
4841; SSE-NEXT:    movaps %xmm1, 32(%r8)
4842; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4843; SSE-NEXT:    movaps %xmm1, 16(%r8)
4844; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4845; SSE-NEXT:    movaps %xmm1, (%r8)
4846; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4847; SSE-NEXT:    movaps %xmm1, 112(%r9)
4848; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4849; SSE-NEXT:    movaps %xmm1, 96(%r9)
4850; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4851; SSE-NEXT:    movaps %xmm1, 80(%r9)
4852; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4853; SSE-NEXT:    movaps %xmm1, 64(%r9)
4854; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4855; SSE-NEXT:    movaps %xmm1, 48(%r9)
4856; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4857; SSE-NEXT:    movaps %xmm1, 32(%r9)
4858; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4859; SSE-NEXT:    movaps %xmm1, 16(%r9)
4860; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4861; SSE-NEXT:    movaps %xmm1, (%r9)
4862; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4863; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4864; SSE-NEXT:    movaps %xmm1, 112(%rax)
4865; SSE-NEXT:    movapd %xmm15, 96(%rax)
4866; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4867; SSE-NEXT:    movaps %xmm1, 80(%rax)
4868; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4869; SSE-NEXT:    movaps %xmm1, 64(%rax)
4870; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4871; SSE-NEXT:    movaps %xmm1, 48(%rax)
4872; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4873; SSE-NEXT:    movaps %xmm1, 32(%rax)
4874; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4875; SSE-NEXT:    movaps %xmm1, 16(%rax)
4876; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4877; SSE-NEXT:    movaps %xmm1, (%rax)
4878; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4879; SSE-NEXT:    movapd %xmm0, 112(%rax)
4880; SSE-NEXT:    movapd %xmm3, 96(%rax)
4881; SSE-NEXT:    movapd %xmm4, 80(%rax)
4882; SSE-NEXT:    movapd %xmm5, 64(%rax)
4883; SSE-NEXT:    movapd %xmm6, 48(%rax)
4884; SSE-NEXT:    movapd %xmm7, 32(%rax)
4885; SSE-NEXT:    movapd %xmm8, 16(%rax)
4886; SSE-NEXT:    movapd %xmm9, (%rax)
4887; SSE-NEXT:    addq $1160, %rsp # imm = 0x488
4888; SSE-NEXT:    retq
4889;
4890; AVX-LABEL: load_i32_stride7_vf32:
4891; AVX:       # %bb.0:
4892; AVX-NEXT:    subq $1432, %rsp # imm = 0x598
4893; AVX-NEXT:    vmovaps 480(%rdi), %ymm4
4894; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4895; AVX-NEXT:    vmovaps 448(%rdi), %ymm3
4896; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4897; AVX-NEXT:    vmovaps 544(%rdi), %ymm5
4898; AVX-NEXT:    vmovaps 32(%rdi), %ymm2
4899; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4900; AVX-NEXT:    vmovaps (%rdi), %ymm1
4901; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4902; AVX-NEXT:    vmovaps 96(%rdi), %ymm12
4903; AVX-NEXT:    vmovaps 80(%rdi), %xmm0
4904; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4905; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
4906; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4907; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
4908; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
4909; AVX-NEXT:    vmovaps (%rdi), %xmm8
4910; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
4911; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4912; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
4913; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4914; AVX-NEXT:    vmovaps 160(%rdi), %xmm2
4915; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4916; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
4917; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4918; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
4919; AVX-NEXT:    vmovaps 192(%rdi), %xmm7
4920; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[1]
4921; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4922; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4923; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4924; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4925; AVX-NEXT:    vmovaps 528(%rdi), %xmm0
4926; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4927; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
4928; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
4929; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
4930; AVX-NEXT:    vmovaps 448(%rdi), %xmm10
4931; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
4932; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
4933; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
4934; AVX-NEXT:    vmovaps 608(%rdi), %xmm2
4935; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4936; AVX-NEXT:    vmovaps 576(%rdi), %xmm1
4937; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4938; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
4939; AVX-NEXT:    vmovaps 640(%rdi), %xmm9
4940; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1]
4941; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4942; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4943; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4944; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4945; AVX-NEXT:    vmovaps 256(%rdi), %ymm1
4946; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4947; AVX-NEXT:    vmovaps 224(%rdi), %ymm0
4948; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4949; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
4950; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
4951; AVX-NEXT:    vmovaps 224(%rdi), %xmm11
4952; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3]
4953; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4954; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
4955; AVX-NEXT:    vmovaps 320(%rdi), %ymm4
4956; AVX-NEXT:    vmovaps 304(%rdi), %xmm1
4957; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4958; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
4959; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4960; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4961; AVX-NEXT:    vmovaps 384(%rdi), %xmm1
4962; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4963; AVX-NEXT:    vmovaps 352(%rdi), %xmm2
4964; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4965; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
4966; AVX-NEXT:    vmovaps 416(%rdi), %xmm3
4967; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm3[1]
4968; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4969; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4970; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4971; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4972; AVX-NEXT:    vmovaps 704(%rdi), %ymm1
4973; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4974; AVX-NEXT:    vmovaps 672(%rdi), %ymm0
4975; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4976; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
4977; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
4978; AVX-NEXT:    vmovaps 672(%rdi), %xmm1
4979; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4980; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4981; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
4982; AVX-NEXT:    vmovaps 768(%rdi), %ymm14
4983; AVX-NEXT:    vmovaps 752(%rdi), %xmm1
4984; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4985; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2]
4986; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4987; AVX-NEXT:    vmovaps 832(%rdi), %xmm2
4988; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4989; AVX-NEXT:    vmovaps 800(%rdi), %xmm1
4990; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4991; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
4992; AVX-NEXT:    vmovaps 864(%rdi), %xmm6
4993; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1]
4994; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
4995; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4996; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4997; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
4998; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4999; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm12[1,1],ymm0[2,2],ymm12[5,5],ymm0[6,6]
5000; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
5001; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
5002; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5003; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3]
5004; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
5005; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
5006; AVX-NEXT:    vmovaps 160(%rdi), %ymm1
5007; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5008; AVX-NEXT:    vmovaps 128(%rdi), %ymm15
5009; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[0,1]
5010; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,0],ymm1[3,3],ymm15[4,4],ymm1[7,7]
5011; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5012; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
5013; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2]
5014; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
5015; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
5016; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5017; AVX-NEXT:    vmovaps 512(%rdi), %ymm0
5018; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5019; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6]
5020; AVX-NEXT:    vmovaps %ymm5, %ymm7
5021; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
5022; AVX-NEXT:    vmovaps 480(%rdi), %xmm1
5023; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5024; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm10[1],xmm1[2,3]
5025; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5026; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3]
5027; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7]
5028; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
5029; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5030; AVX-NEXT:    vmovaps 576(%rdi), %ymm12
5031; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm0[0,1]
5032; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm12[0,0],ymm5[3,3],ymm12[4,4],ymm5[7,7]
5033; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
5034; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm9[2]
5035; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
5036; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
5037; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5038; AVX-NEXT:    vmovaps 288(%rdi), %ymm0
5039; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5040; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6]
5041; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
5042; AVX-NEXT:    vmovaps 256(%rdi), %xmm0
5043; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5044; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm0[0],xmm11[1],xmm0[2,3]
5045; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3]
5046; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
5047; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
5048; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5049; AVX-NEXT:    vmovaps 352(%rdi), %ymm0
5050; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5051; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1]
5052; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7]
5053; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
5054; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm3[2]
5055; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
5056; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
5057; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5058; AVX-NEXT:    vmovaps 736(%rdi), %ymm5
5059; AVX-NEXT:    vmovaps %ymm14, %ymm3
5060; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5061; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm5[2,2],ymm14[5,5],ymm5[6,6]
5062; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5063; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
5064; AVX-NEXT:    vmovaps 704(%rdi), %xmm4
5065; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5066; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm4[0],xmm1[1],xmm4[2,3]
5067; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5068; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3]
5069; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3,4,5,6,7]
5070; AVX-NEXT:    vmovaps 832(%rdi), %ymm13
5071; AVX-NEXT:    vmovaps 800(%rdi), %ymm2
5072; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm13[0,1]
5073; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5074; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7]
5075; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5076; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5077; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2]
5078; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
5079; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
5080; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5081; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
5082; AVX-NEXT:    # xmm11 = mem[2,3,2,3]
5083; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5084; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3]
5085; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5086; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5087; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm9[0,3],ymm14[7,5],ymm9[4,7]
5088; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5089; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm8[2,1],ymm14[2,0],ymm8[6,5],ymm14[6,4]
5090; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
5091; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
5092; AVX-NEXT:    # ymm14 = ymm15[0],mem[0],ymm15[2],mem[2]
5093; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5094; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
5095; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
5096; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
5097; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
5098; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5099; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3]
5100; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5101; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3]
5102; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5103; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5104; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm10[0,3],ymm14[7,5],ymm10[4,7]
5105; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4]
5106; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
5107; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload
5108; AVX-NEXT:    # ymm14 = ymm12[0],mem[0],ymm12[2],mem[2]
5109; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5110; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
5111; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
5112; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
5113; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
5114; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5115; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[2,3,2,3]
5116; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3]
5117; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5118; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7]
5119; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm14[2,0],ymm3[6,5],ymm14[6,4]
5120; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
5121; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm13[0],ymm2[2],ymm13[2]
5122; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5123; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm6[3]
5124; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
5125; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7]
5126; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5127; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5128; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm6[2,3,2,3]
5129; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5130; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
5131; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5132; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5133; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1],ymm3[0,3],ymm11[7,5],ymm3[4,7]
5134; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5135; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm2[2,1],ymm11[2,0],ymm2[6,5],ymm11[6,4]
5136; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5,6,7]
5137; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5138; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5139; AVX-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm5[0],ymm13[2],ymm5[2]
5140; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm11
5141; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
5142; AVX-NEXT:    # xmm11 = xmm11[0,1,2],mem[3]
5143; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
5144; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7]
5145; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5146; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4]
5147; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[3,1],ymm4[0,2],ymm8[7,5],ymm4[4,6]
5148; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
5149; AVX-NEXT:    # xmm9 = xmm0[0,1,2],mem[3]
5150; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3]
5151; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5,6,7]
5152; AVX-NEXT:    vmovaps 192(%rdi), %ymm0
5153; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5154; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5155; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7]
5156; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5157; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4]
5158; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7]
5159; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5160; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4]
5161; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm4[0,2],ymm7[7,5],ymm4[4,6]
5162; AVX-NEXT:    vmovaps %xmm15, %xmm10
5163; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload
5164; AVX-NEXT:    # xmm8 = xmm15[0,1,2],mem[3]
5165; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
5166; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3,4,5,6,7]
5167; AVX-NEXT:    vmovaps 640(%rdi), %ymm0
5168; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5169; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5170; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm4[1,3],ymm0[4,5],ymm4[5,7]
5171; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[2,0],ymm12[4,6],ymm11[6,4]
5172; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7]
5173; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5174; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[0,0],ymm3[5,4],ymm2[4,4]
5175; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[3,1],ymm7[0,2],ymm2[7,5],ymm7[4,6]
5176; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm6[3]
5177; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
5178; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
5179; AVX-NEXT:    vmovaps 416(%rdi), %ymm15
5180; AVX-NEXT:    vmovaps %ymm5, %ymm8
5181; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm15[0,1],ymm5[1,3],ymm15[4,5],ymm5[5,7]
5182; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[0,2],ymm11[2,0],ymm13[4,6],ymm11[6,4]
5183; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7]
5184; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5185; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5186; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5187; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
5188; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[0,2],ymm0[7,5],ymm5[4,6]
5189; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5190; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
5191; AVX-NEXT:    # xmm6 = mem[0,1,2],xmm0[3]
5192; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3]
5193; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3,4,5,6,7]
5194; AVX-NEXT:    vmovaps 864(%rdi), %ymm0
5195; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5196; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5197; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[0,1],ymm9[1,3],ymm0[4,5],ymm9[5,7]
5198; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5199; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4]
5200; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7]
5201; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5202; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm12[2,3,0,1]
5203; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm6[0,0],ymm12[7,4],ymm6[4,4]
5204; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5205; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4]
5206; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,0],ymm0[6,4],ymm6[6,4]
5207; AVX-NEXT:    vmovaps 544(%rdi), %xmm1
5208; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5209; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1]
5210; AVX-NEXT:    vmovaps 512(%rdi), %xmm7
5211; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3]
5212; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3]
5213; AVX-NEXT:    vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3]
5214; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3]
5215; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5216; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5217; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5218; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
5219; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
5220; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5221; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4]
5222; AVX-NEXT:    vmovaps %ymm14, %ymm6
5223; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
5224; AVX-NEXT:    vmovaps 64(%rdi), %xmm1
5225; AVX-NEXT:    vmovaps 96(%rdi), %xmm4
5226; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[0,1,0,1]
5227; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5228; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
5229; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5230; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
5231; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
5232; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
5233; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5234; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5235; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1]
5236; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4]
5237; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm8[2,0],ymm15[5,4],ymm8[6,4]
5238; AVX-NEXT:    vmovaps %ymm15, %ymm11
5239; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5240; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
5241; AVX-NEXT:    vmovaps 320(%rdi), %xmm2
5242; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5243; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm2[0,1,0,1]
5244; AVX-NEXT:    vmovaps 288(%rdi), %xmm3
5245; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
5246; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5247; AVX-NEXT:    # xmm15 = mem[2,3,2,3]
5248; AVX-NEXT:    vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3]
5249; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5250; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
5251; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5252; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5253; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
5254; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4]
5255; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5256; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
5257; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4]
5258; AVX-NEXT:    vmovaps 768(%rdi), %xmm0
5259; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5260; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1]
5261; AVX-NEXT:    vmovaps 736(%rdi), %xmm2
5262; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3]
5263; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5264; AVX-NEXT:    # xmm13 = mem[2,3,2,3]
5265; AVX-NEXT:    vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3]
5266; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
5267; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7]
5268; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5269; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[2,1],ymm6[3,3],ymm10[6,5],ymm6[7,7]
5270; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5271; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
5272; AVX-NEXT:    # xmm14 = xmm0[0],mem[1],xmm0[2,3]
5273; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
5274; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
5275; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
5276; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5277; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5278; AVX-NEXT:    # ymm14 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4]
5279; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5280; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[3,2]
5281; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
5282; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5283; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
5284; AVX-NEXT:    # ymm13 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7]
5285; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5286; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5287; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2,3]
5288; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
5289; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
5290; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5291; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3]
5292; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5293; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5294; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm4[1,0],ymm0[4,4],ymm4[5,4]
5295; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5296; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm14[2,0],xmm7[3,2]
5297; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm13[4,5,6,7]
5298; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5299; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
5300; AVX-NEXT:    # ymm13 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
5301; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5302; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload
5303; AVX-NEXT:    # xmm14 = mem[0],xmm7[1],mem[2,3]
5304; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
5305; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
5306; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5307; AVX-NEXT:    # xmm3 = mem[0,1,2],xmm3[3]
5308; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5309; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
5310; AVX-NEXT:    # ymm14 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4]
5311; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
5312; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm14[2,0],xmm3[3,2]
5313; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm13[4,5,6,7]
5314; AVX-NEXT:    vmovaps %ymm8, %ymm7
5315; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[2,1],ymm9[3,3],ymm8[6,5],ymm9[7,7]
5316; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5317; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5318; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm9[0],xmm8[1],xmm9[2,3]
5319; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
5320; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4]
5321; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5322; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
5323; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5324; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5325; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,0],ymm14[1,0],ymm1[4,4],ymm14[5,4]
5326; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
5327; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[2,0],xmm2[3,2]
5328; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5329; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5330; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
5331; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3]
5332; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
5333; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
5334; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3]
5335; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5336; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
5337; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
5338; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3]
5339; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3]
5340; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
5341; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
5342; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5343; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5344; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
5345; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm5[3]
5346; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4]
5347; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
5348; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,3]
5349; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1]
5350; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm2[0,0],ymm7[7,4],ymm2[4,4]
5351; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3]
5352; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3]
5353; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
5354; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4]
5355; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5356; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5357; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
5358; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5359; AVX-NEXT:    # xmm2 = xmm2[0,1,2],mem[3]
5360; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5361; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
5362; AVX-NEXT:    # ymm4 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
5363; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
5364; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[2,0],xmm2[2,3]
5365; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5366; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
5367; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm4[0,0],ymm1[7,4],ymm4[4,4]
5368; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5369; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
5370; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
5371; AVX-NEXT:    # xmm5 = xmm5[0],mem[1],xmm5[2,3]
5372; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
5373; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4]
5374; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
5375; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5376; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
5377; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5378; AVX-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
5379; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
5380; AVX-NEXT:    # ymm5 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
5381; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
5382; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3]
5383; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5384; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1]
5385; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm5[0,0],ymm1[7,4],ymm5[4,4]
5386; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
5387; AVX-NEXT:    # xmm6 = mem[2,3,2,3]
5388; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
5389; AVX-NEXT:    # xmm6 = xmm6[0],mem[1],xmm6[2,3]
5390; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
5391; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4]
5392; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5393; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5394; AVX-NEXT:    vmovaps %ymm1, 96(%rsi)
5395; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5396; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
5397; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5398; AVX-NEXT:    vmovaps %ymm5, 64(%rsi)
5399; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5400; AVX-NEXT:    vmovaps %ymm5, (%rsi)
5401; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5402; AVX-NEXT:    vmovaps %ymm1, 96(%rdx)
5403; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5404; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
5405; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5406; AVX-NEXT:    vmovaps %ymm1, 64(%rdx)
5407; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5408; AVX-NEXT:    vmovaps %ymm1, (%rdx)
5409; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5410; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5411; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5412; AVX-NEXT:    vmovaps %ymm1, 96(%rcx)
5413; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5414; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
5415; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5416; AVX-NEXT:    vmovaps %ymm1, (%rcx)
5417; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5418; AVX-NEXT:    vmovaps %ymm1, 96(%r8)
5419; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5420; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
5421; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5422; AVX-NEXT:    vmovaps %ymm1, 64(%r8)
5423; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5424; AVX-NEXT:    vmovaps %ymm1, (%r8)
5425; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5426; AVX-NEXT:    vmovaps %ymm1, 96(%r9)
5427; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5428; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
5429; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5430; AVX-NEXT:    vmovaps %ymm1, (%r9)
5431; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5432; AVX-NEXT:    vmovaps %ymm1, 64(%r9)
5433; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5434; AVX-NEXT:    vmovaps %ymm12, 96(%rax)
5435; AVX-NEXT:    vmovaps %ymm13, 32(%rax)
5436; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5437; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
5438; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5439; AVX-NEXT:    vmovaps %ymm1, (%rax)
5440; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5441; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
5442; AVX-NEXT:    vmovaps %ymm2, (%rax)
5443; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
5444; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
5445; AVX-NEXT:    addq $1432, %rsp # imm = 0x598
5446; AVX-NEXT:    vzeroupper
5447; AVX-NEXT:    retq
5448;
5449; AVX2-LABEL: load_i32_stride7_vf32:
5450; AVX2:       # %bb.0:
5451; AVX2-NEXT:    subq $1192, %rsp # imm = 0x4A8
5452; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm9
5453; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm4
5454; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm5
5455; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm12
5456; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm7
5457; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm8
5458; AVX2-NEXT:    vmovdqa (%rdi), %ymm14
5459; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm13
5460; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm11
5461; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm0
5462; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7]
5463; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
5464; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7]
5465; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5466; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5467; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
5468; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
5469; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm2
5470; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
5471; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5472; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5473; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5474; AVX2-NEXT:    vpbroadcastd 196(%rdi), %ymm3
5475; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5476; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5477; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5478; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7]
5479; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5480; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5481; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
5482; AVX2-NEXT:    vpbroadcastq 528(%rdi), %ymm2
5483; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
5484; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
5485; AVX2-NEXT:    vmovdqa 576(%rdi), %xmm2
5486; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm3
5487; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5488; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5489; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5490; AVX2-NEXT:    vpbroadcastd 644(%rdi), %ymm3
5491; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5492; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5493; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5494; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
5495; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5496; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
5498; AVX2-NEXT:    vpbroadcastq 304(%rdi), %ymm2
5499; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
5500; AVX2-NEXT:    vmovdqa %ymm9, %ymm10
5501; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5502; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
5503; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm2
5504; AVX2-NEXT:    vmovdqa 384(%rdi), %xmm3
5505; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5506; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5507; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5508; AVX2-NEXT:    vpbroadcastd 420(%rdi), %ymm3
5509; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5510; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5511; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5512; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm2
5513; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm6
5514; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7]
5515; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5516; AVX2-NEXT:    vmovdqa %ymm2, %ymm3
5517; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5518; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
5519; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm15
5520; AVX2-NEXT:    vpbroadcastq 752(%rdi), %ymm1
5521; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
5522; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
5523; AVX2-NEXT:    vmovdqa 800(%rdi), %xmm1
5524; AVX2-NEXT:    vmovdqa 832(%rdi), %xmm2
5525; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5526; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
5527; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5528; AVX2-NEXT:    vpbroadcastd 868(%rdi), %ymm2
5529; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5530; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
5531; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5532; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
5533; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5534; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm2
5535; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5536; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm1
5537; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5538; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
5539; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
5540; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5541; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm9
5542; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
5543; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5544; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5545; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
5546; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5547; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
5548; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
5549; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
5550; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
5551; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5552; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
5553; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5554; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm7
5555; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5556; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm2
5557; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5558; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
5559; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
5560; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5561; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
5562; AVX2-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
5563; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
5564; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
5565; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5566; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
5567; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
5568; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
5569; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5570; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
5571; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5572; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm8
5573; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm10
5574; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
5575; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5576; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5577; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
5578; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5579; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm7
5580; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
5581; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5582; AVX2-NEXT:    vmovdqa %ymm15, %ymm5
5583; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5584; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
5585; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5586; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
5587; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
5588; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
5589; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5590; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
5591; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5592; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm6
5593; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm15
5594; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
5595; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5596; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
5597; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5598; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm1
5599; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5600; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
5601; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
5602; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5603; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
5604; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm0
5605; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
5606; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5607; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
5608; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
5609; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
5610; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm2
5611; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm3
5612; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
5613; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
5614; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
5615; AVX2-NEXT:    vmovdqa %ymm6, %ymm11
5616; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm4
5617; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
5618; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
5619; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5620; AVX2-NEXT:    vmovdqa 528(%rdi), %xmm0
5621; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
5622; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
5623; AVX2-NEXT:    vpbroadcastd 456(%rdi), %xmm4
5624; AVX2-NEXT:    vmovdqa 480(%rdi), %xmm2
5625; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
5626; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
5627; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5628; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5629; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
5630; AVX2-NEXT:    vpbroadcastd 652(%rdi), %ymm15
5631; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7]
5632; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
5633; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5634; AVX2-NEXT:    vmovdqa 752(%rdi), %xmm0
5635; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
5636; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
5637; AVX2-NEXT:    vpbroadcastd 680(%rdi), %xmm15
5638; AVX2-NEXT:    vmovdqa 704(%rdi), %xmm7
5639; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
5640; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
5641; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
5642; AVX2-NEXT:    vpbroadcastd 876(%rdi), %ymm13
5643; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
5644; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
5645; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5646; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm0
5647; AVX2-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
5648; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5649; AVX2-NEXT:    vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
5650; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
5651; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm15
5652; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm0
5653; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
5654; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
5655; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5656; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5657; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
5658; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm14
5659; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
5660; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
5661; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5662; AVX2-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5663; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
5664; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
5665; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
5666; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
5667; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
5668; AVX2-NEXT:    vmovdqa %ymm11, %ymm13
5669; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5670; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7]
5671; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm11
5672; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
5673; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
5674; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5675; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5676; AVX2-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5677; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
5678; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
5679; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
5680; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
5681; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
5682; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7]
5683; AVX2-NEXT:    vbroadcastss 656(%rdi), %ymm3
5684; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5685; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5686; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5687; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
5688; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
5689; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
5690; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
5691; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
5692; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
5693; AVX2-NEXT:    vmovaps %ymm6, %ymm15
5694; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm2
5695; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5696; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
5697; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5698; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5699; AVX2-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5700; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7]
5701; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3]
5702; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
5703; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
5704; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
5705; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5706; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5707; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7]
5708; AVX2-NEXT:    vbroadcastss 880(%rdi), %ymm2
5709; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5710; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
5711; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5712; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = [4,3,0,0]
5713; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5714; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5715; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
5716; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
5717; AVX2-NEXT:    vbroadcastss 548(%rdi), %xmm2
5718; AVX2-NEXT:    vmovaps 512(%rdi), %xmm7
5719; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3]
5720; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
5721; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
5722; AVX2-NEXT:    vpermps %ymm12, %ymm11, %ymm2
5723; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
5724; AVX2-NEXT:    vmovaps %ymm9, %ymm12
5725; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm3
5726; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5727; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5728; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5729; AVX2-NEXT:    vbroadcastss 100(%rdi), %xmm2
5730; AVX2-NEXT:    vmovaps 64(%rdi), %xmm0
5731; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
5732; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5733; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
5734; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
5735; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm3
5736; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5737; AVX2-NEXT:    vpermps %ymm10, %ymm11, %ymm3
5738; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
5739; AVX2-NEXT:    vbroadcastss 212(%rdi), %ymm4
5740; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
5741; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5742; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5743; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5744; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
5745; AVX2-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
5746; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm3
5747; AVX2-NEXT:    vbroadcastss 324(%rdi), %xmm4
5748; AVX2-NEXT:    vmovaps 288(%rdi), %xmm2
5749; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
5750; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
5751; AVX2-NEXT:    vpermps %ymm15, %ymm11, %ymm4
5752; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
5753; AVX2-NEXT:    vmovaps %ymm8, %ymm9
5754; AVX2-NEXT:    vbroadcastss 436(%rdi), %ymm8
5755; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
5756; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
5757; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5758; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5759; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
5760; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
5761; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm1
5762; AVX2-NEXT:    vbroadcastss 772(%rdi), %xmm4
5763; AVX2-NEXT:    vmovaps 736(%rdi), %xmm3
5764; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
5765; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
5766; AVX2-NEXT:    vpermps %ymm14, %ymm11, %ymm4
5767; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
5768; AVX2-NEXT:    vbroadcastss 884(%rdi), %ymm8
5769; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
5770; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
5771; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5772; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
5773; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7]
5774; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5775; AVX2-NEXT:    vbroadcastss 216(%rdi), %ymm4
5776; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
5777; AVX2-NEXT:    vmovaps 96(%rdi), %xmm10
5778; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
5779; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
5780; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
5781; AVX2-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
5782; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
5783; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
5784; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5785; AVX2-NEXT:    vmovaps 544(%rdi), %xmm4
5786; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3]
5787; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
5788; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5789; AVX2-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
5790; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
5791; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
5792; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
5793; AVX2-NEXT:    # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
5794; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
5795; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
5796; AVX2-NEXT:    vbroadcastss 664(%rdi), %ymm7
5797; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
5798; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5799; AVX2-NEXT:    vmovaps 320(%rdi), %xmm12
5800; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
5801; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
5802; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5803; AVX2-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
5804; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
5805; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
5806; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
5807; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
5808; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
5809; AVX2-NEXT:    vbroadcastss 440(%rdi), %ymm7
5810; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
5811; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
5812; AVX2-NEXT:    vmovaps 768(%rdi), %xmm2
5813; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
5814; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
5815; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
5816; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
5817; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
5818; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
5819; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
5820; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
5821; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
5822; AVX2-NEXT:    vbroadcastss 888(%rdi), %ymm8
5823; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
5824; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7]
5825; AVX2-NEXT:    vbroadcastss 584(%rdi), %xmm3
5826; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5827; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
5828; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
5829; AVX2-NEXT:    vpermps 640(%rdi), %ymm11, %ymm8
5830; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
5831; AVX2-NEXT:    vbroadcastss 528(%rdi), %ymm8
5832; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
5833; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
5834; AVX2-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
5835; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
5836; AVX2-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
5837; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
5838; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
5839; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5840; AVX2-NEXT:    vbroadcastss 808(%rdi), %xmm3
5841; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5842; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
5843; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
5844; AVX2-NEXT:    vpermps 864(%rdi), %ymm11, %ymm4
5845; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
5846; AVX2-NEXT:    vbroadcastss 752(%rdi), %ymm4
5847; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
5848; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
5849; AVX2-NEXT:    # ymm4 = mem[2,3,2,3,6,7,6,7]
5850; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5851; AVX2-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
5852; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
5853; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
5854; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5855; AVX2-NEXT:    vbroadcastss 136(%rdi), %xmm3
5856; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5857; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
5858; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
5859; AVX2-NEXT:    vpermps 192(%rdi), %ymm11, %ymm4
5860; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
5861; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm4
5862; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
5863; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
5864; AVX2-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
5865; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5866; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
5867; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
5868; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
5869; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5870; AVX2-NEXT:    vbroadcastss 360(%rdi), %xmm4
5871; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5872; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
5873; AVX2-NEXT:    vpermps 416(%rdi), %ymm11, %ymm6
5874; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
5875; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
5876; AVX2-NEXT:    vbroadcastss 304(%rdi), %ymm6
5877; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
5878; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
5879; AVX2-NEXT:    # ymm10 = mem[2,3,2,3,6,7,6,7]
5880; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
5881; AVX2-NEXT:    # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
5882; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm10
5883; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
5884; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
5885; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5886; AVX2-NEXT:    vmovaps %ymm6, 96(%rsi)
5887; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5888; AVX2-NEXT:    vmovaps %ymm6, 32(%rsi)
5889; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5890; AVX2-NEXT:    vmovaps %ymm6, 64(%rsi)
5891; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5892; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
5893; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5894; AVX2-NEXT:    vmovaps %ymm6, 96(%rdx)
5895; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5896; AVX2-NEXT:    vmovaps %ymm6, 32(%rdx)
5897; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5898; AVX2-NEXT:    vmovaps %ymm6, 64(%rdx)
5899; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5900; AVX2-NEXT:    vmovaps %ymm6, (%rdx)
5901; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5902; AVX2-NEXT:    vmovaps %ymm6, 32(%rcx)
5903; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5904; AVX2-NEXT:    vmovaps %ymm6, 96(%rcx)
5905; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5906; AVX2-NEXT:    vmovaps %ymm6, 64(%rcx)
5907; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5908; AVX2-NEXT:    vmovaps %ymm6, (%rcx)
5909; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5910; AVX2-NEXT:    vmovaps %ymm6, 96(%r8)
5911; AVX2-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
5912; AVX2-NEXT:    vmovaps %ymm6, 32(%r8)
5913; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5914; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
5915; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5916; AVX2-NEXT:    vmovaps %ymm6, (%r8)
5917; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5918; AVX2-NEXT:    vmovaps %ymm6, 96(%r9)
5919; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5920; AVX2-NEXT:    vmovaps %ymm6, 32(%r9)
5921; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5922; AVX2-NEXT:    vmovaps %ymm6, (%r9)
5923; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5924; AVX2-NEXT:    vmovaps %ymm6, 64(%r9)
5925; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5926; AVX2-NEXT:    vmovaps %ymm7, 96(%rax)
5927; AVX2-NEXT:    vmovaps %ymm5, 32(%rax)
5928; AVX2-NEXT:    vmovaps %ymm1, 64(%rax)
5929; AVX2-NEXT:    vmovaps %ymm0, (%rax)
5930; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5931; AVX2-NEXT:    vmovaps %ymm4, 32(%rax)
5932; AVX2-NEXT:    vmovaps %ymm3, (%rax)
5933; AVX2-NEXT:    vmovaps %ymm2, 96(%rax)
5934; AVX2-NEXT:    vmovaps %ymm8, 64(%rax)
5935; AVX2-NEXT:    addq $1192, %rsp # imm = 0x4A8
5936; AVX2-NEXT:    vzeroupper
5937; AVX2-NEXT:    retq
5938;
5939; AVX2-FP-LABEL: load_i32_stride7_vf32:
5940; AVX2-FP:       # %bb.0:
5941; AVX2-FP-NEXT:    subq $1192, %rsp # imm = 0x4A8
5942; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm9
5943; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm4
5944; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm5
5945; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm12
5946; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm7
5947; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm8
5948; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm14
5949; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm13
5950; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm11
5951; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm0
5952; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7]
5953; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
5954; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7]
5955; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5956; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5957; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
5958; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
5959; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm2
5960; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm3
5961; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5962; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5963; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5964; AVX2-FP-NEXT:    vpbroadcastd 196(%rdi), %ymm3
5965; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5966; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5967; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5968; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7]
5969; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5970; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5971; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
5972; AVX2-FP-NEXT:    vpbroadcastq 528(%rdi), %ymm2
5973; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
5974; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
5975; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %xmm2
5976; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm3
5977; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5978; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5979; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5980; AVX2-FP-NEXT:    vpbroadcastd 644(%rdi), %ymm3
5981; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5982; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5983; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5984; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
5985; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5986; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5987; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
5988; AVX2-FP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
5989; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
5990; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm10
5991; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5992; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
5993; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm2
5994; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %xmm3
5995; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5996; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5997; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5998; AVX2-FP-NEXT:    vpbroadcastd 420(%rdi), %ymm3
5999; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6000; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
6001; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6002; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm2
6003; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm6
6004; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7]
6005; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6006; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm3
6007; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6008; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
6009; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm15
6010; AVX2-FP-NEXT:    vpbroadcastq 752(%rdi), %ymm1
6011; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
6012; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
6013; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %xmm1
6014; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %xmm2
6015; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6016; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
6017; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6018; AVX2-FP-NEXT:    vpbroadcastd 868(%rdi), %ymm2
6019; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6020; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
6021; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6022; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
6023; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6024; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm2
6025; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6026; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm1
6027; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6028; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
6029; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
6030; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6031; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm9
6032; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
6033; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6034; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6035; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
6036; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6037; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
6038; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
6039; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6040; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
6041; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6042; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
6043; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6044; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm7
6045; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6046; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm2
6047; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6048; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
6049; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
6050; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6051; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
6052; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
6053; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
6054; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
6055; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6056; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
6057; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6058; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
6059; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6060; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
6061; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6062; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm8
6063; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm10
6064; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
6065; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6066; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6067; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
6068; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6069; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm7
6070; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
6071; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6072; AVX2-FP-NEXT:    vmovdqa %ymm15, %ymm5
6073; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6074; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
6075; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6076; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
6077; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6078; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
6079; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6080; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
6081; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6082; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm6
6083; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm15
6084; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
6085; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6086; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
6087; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6088; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm1
6089; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6090; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
6091; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
6092; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6093; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
6094; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
6095; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6096; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6097; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
6098; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6099; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
6100; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm2
6101; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm3
6102; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
6103; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
6104; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
6105; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm11
6106; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm4
6107; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
6108; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6109; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6110; AVX2-FP-NEXT:    vmovdqa 528(%rdi), %xmm0
6111; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
6112; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
6113; AVX2-FP-NEXT:    vpbroadcastd 456(%rdi), %xmm4
6114; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %xmm2
6115; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
6116; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
6117; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6118; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6119; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
6120; AVX2-FP-NEXT:    vpbroadcastd 652(%rdi), %ymm15
6121; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7]
6122; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
6123; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6124; AVX2-FP-NEXT:    vmovdqa 752(%rdi), %xmm0
6125; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
6126; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
6127; AVX2-FP-NEXT:    vpbroadcastd 680(%rdi), %xmm15
6128; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %xmm7
6129; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
6130; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
6131; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
6132; AVX2-FP-NEXT:    vpbroadcastd 876(%rdi), %ymm13
6133; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
6134; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
6135; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6136; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm0
6137; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
6138; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6139; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
6140; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
6141; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm15
6142; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm0
6143; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
6144; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
6145; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6146; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6147; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
6148; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm14
6149; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
6150; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
6151; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6152; AVX2-FP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6153; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
6154; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
6155; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
6156; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
6157; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
6158; AVX2-FP-NEXT:    vmovdqa %ymm11, %ymm13
6159; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6160; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7]
6161; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm11
6162; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
6163; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
6164; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6165; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6166; AVX2-FP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6167; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
6168; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
6169; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
6170; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
6171; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
6172; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7]
6173; AVX2-FP-NEXT:    vbroadcastss 656(%rdi), %ymm3
6174; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6175; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
6176; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6177; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
6178; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
6179; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
6180; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
6181; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
6182; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
6183; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm15
6184; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm2
6185; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6186; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
6187; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6188; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6189; AVX2-FP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6190; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7]
6191; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3]
6192; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
6193; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
6194; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
6195; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
6196; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6197; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7]
6198; AVX2-FP-NEXT:    vbroadcastss 880(%rdi), %ymm2
6199; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6200; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
6201; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6202; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm1 = [4,3,0,0]
6203; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6204; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6205; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
6206; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
6207; AVX2-FP-NEXT:    vbroadcastss 548(%rdi), %xmm2
6208; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm7
6209; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3]
6210; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
6211; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
6212; AVX2-FP-NEXT:    vpermps %ymm12, %ymm11, %ymm2
6213; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
6214; AVX2-FP-NEXT:    vmovaps %ymm9, %ymm12
6215; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm3
6216; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6217; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6218; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6219; AVX2-FP-NEXT:    vbroadcastss 100(%rdi), %xmm2
6220; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm0
6221; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
6222; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6223; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6224; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
6225; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm3
6226; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6227; AVX2-FP-NEXT:    vpermps %ymm10, %ymm11, %ymm3
6228; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
6229; AVX2-FP-NEXT:    vbroadcastss 212(%rdi), %ymm4
6230; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
6231; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6232; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6233; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6234; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6235; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
6236; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm3
6237; AVX2-FP-NEXT:    vbroadcastss 324(%rdi), %xmm4
6238; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm2
6239; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
6240; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
6241; AVX2-FP-NEXT:    vpermps %ymm15, %ymm11, %ymm4
6242; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
6243; AVX2-FP-NEXT:    vmovaps %ymm8, %ymm9
6244; AVX2-FP-NEXT:    vbroadcastss 436(%rdi), %ymm8
6245; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
6246; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
6247; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6248; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6249; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6250; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
6251; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm1
6252; AVX2-FP-NEXT:    vbroadcastss 772(%rdi), %xmm4
6253; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm3
6254; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
6255; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
6256; AVX2-FP-NEXT:    vpermps %ymm14, %ymm11, %ymm4
6257; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
6258; AVX2-FP-NEXT:    vbroadcastss 884(%rdi), %ymm8
6259; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
6260; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
6261; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6262; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
6263; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7]
6264; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
6265; AVX2-FP-NEXT:    vbroadcastss 216(%rdi), %ymm4
6266; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
6267; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm10
6268; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
6269; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
6270; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
6271; AVX2-FP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
6272; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
6273; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
6274; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6275; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm4
6276; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3]
6277; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
6278; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
6279; AVX2-FP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
6280; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
6281; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
6282; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
6283; AVX2-FP-NEXT:    # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
6284; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
6285; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
6286; AVX2-FP-NEXT:    vbroadcastss 664(%rdi), %ymm7
6287; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
6288; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
6289; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm12
6290; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
6291; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
6292; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
6293; AVX2-FP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
6294; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
6295; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
6296; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
6297; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
6298; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
6299; AVX2-FP-NEXT:    vbroadcastss 440(%rdi), %ymm7
6300; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
6301; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
6302; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm2
6303; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
6304; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
6305; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
6306; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
6307; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
6308; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
6309; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
6310; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
6311; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
6312; AVX2-FP-NEXT:    vbroadcastss 888(%rdi), %ymm8
6313; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
6314; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7]
6315; AVX2-FP-NEXT:    vbroadcastss 584(%rdi), %xmm3
6316; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6317; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
6318; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6319; AVX2-FP-NEXT:    vpermps 640(%rdi), %ymm11, %ymm8
6320; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
6321; AVX2-FP-NEXT:    vbroadcastss 528(%rdi), %ymm8
6322; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
6323; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
6324; AVX2-FP-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
6325; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
6326; AVX2-FP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
6327; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
6328; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
6329; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6330; AVX2-FP-NEXT:    vbroadcastss 808(%rdi), %xmm3
6331; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6332; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
6333; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6334; AVX2-FP-NEXT:    vpermps 864(%rdi), %ymm11, %ymm4
6335; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
6336; AVX2-FP-NEXT:    vbroadcastss 752(%rdi), %ymm4
6337; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
6338; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
6339; AVX2-FP-NEXT:    # ymm4 = mem[2,3,2,3,6,7,6,7]
6340; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6341; AVX2-FP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
6342; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
6343; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
6344; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6345; AVX2-FP-NEXT:    vbroadcastss 136(%rdi), %xmm3
6346; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6347; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
6348; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6349; AVX2-FP-NEXT:    vpermps 192(%rdi), %ymm11, %ymm4
6350; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
6351; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm4
6352; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
6353; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6354; AVX2-FP-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
6355; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
6356; AVX2-FP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
6357; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
6358; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
6359; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6360; AVX2-FP-NEXT:    vbroadcastss 360(%rdi), %xmm4
6361; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
6362; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
6363; AVX2-FP-NEXT:    vpermps 416(%rdi), %ymm11, %ymm6
6364; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
6365; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
6366; AVX2-FP-NEXT:    vbroadcastss 304(%rdi), %ymm6
6367; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
6368; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
6369; AVX2-FP-NEXT:    # ymm10 = mem[2,3,2,3,6,7,6,7]
6370; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
6371; AVX2-FP-NEXT:    # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
6372; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm10
6373; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
6374; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
6375; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6376; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rsi)
6377; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6378; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rsi)
6379; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6380; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rsi)
6381; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6382; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
6383; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6384; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rdx)
6385; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6386; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rdx)
6387; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6388; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rdx)
6389; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6390; AVX2-FP-NEXT:    vmovaps %ymm6, (%rdx)
6391; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6392; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rcx)
6393; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6394; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rcx)
6395; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6396; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rcx)
6397; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6398; AVX2-FP-NEXT:    vmovaps %ymm6, (%rcx)
6399; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6400; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%r8)
6401; AVX2-FP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
6402; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r8)
6403; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6404; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r8)
6405; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6406; AVX2-FP-NEXT:    vmovaps %ymm6, (%r8)
6407; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6408; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%r9)
6409; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6410; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r9)
6411; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6412; AVX2-FP-NEXT:    vmovaps %ymm6, (%r9)
6413; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6414; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r9)
6415; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6416; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rax)
6417; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rax)
6418; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rax)
6419; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
6420; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6421; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rax)
6422; AVX2-FP-NEXT:    vmovaps %ymm3, (%rax)
6423; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rax)
6424; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rax)
6425; AVX2-FP-NEXT:    addq $1192, %rsp # imm = 0x4A8
6426; AVX2-FP-NEXT:    vzeroupper
6427; AVX2-FP-NEXT:    retq
6428;
6429; AVX2-FCP-LABEL: load_i32_stride7_vf32:
6430; AVX2-FCP:       # %bb.0:
6431; AVX2-FCP-NEXT:    subq $1224, %rsp # imm = 0x4C8
6432; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm10
6433; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
6434; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
6435; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm12
6436; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm7
6437; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm8
6438; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm14
6439; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
6440; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6441; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
6442; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6443; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm0
6444; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6445; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
6446; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6],ymm14[7]
6447; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6448; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6449; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
6450; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm2
6451; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
6452; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6453; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
6454; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6455; AVX2-FCP-NEXT:    vpbroadcastd 196(%rdi), %ymm3
6456; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6457; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
6458; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6459; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7]
6460; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6461; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6462; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
6463; AVX2-FCP-NEXT:    vpbroadcastq 528(%rdi), %ymm2
6464; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
6465; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
6466; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %xmm2
6467; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %xmm3
6468; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6469; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
6470; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6471; AVX2-FCP-NEXT:    vpbroadcastd 644(%rdi), %ymm3
6472; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6473; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
6474; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6475; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
6476; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6477; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6478; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
6479; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
6480; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
6481; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
6482; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
6483; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
6484; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm3
6485; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6486; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
6487; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6488; AVX2-FCP-NEXT:    vpbroadcastd 420(%rdi), %ymm3
6489; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6490; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
6491; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6492; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm2
6493; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm6
6494; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7]
6495; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6496; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm3
6497; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6498; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
6499; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm11
6500; AVX2-FCP-NEXT:    vpbroadcastq 752(%rdi), %ymm1
6501; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
6502; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
6503; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %xmm1
6504; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %xmm2
6505; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6506; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
6507; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6508; AVX2-FCP-NEXT:    vpbroadcastd 868(%rdi), %ymm2
6509; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6510; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
6511; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6512; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
6513; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6514; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm2
6515; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6516; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
6517; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6518; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
6519; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
6520; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6521; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm13
6522; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
6523; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6524; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6525; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
6526; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6527; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
6528; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
6529; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6530; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
6531; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6532; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
6533; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6534; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
6535; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6536; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm2
6537; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6538; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
6539; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
6540; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6541; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
6542; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6543; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
6544; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
6545; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6546; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
6547; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6548; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
6549; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6550; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
6551; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6552; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm9
6553; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm15
6554; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
6555; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6556; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6557; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
6558; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6559; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm5
6560; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6561; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7]
6562; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6563; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
6564; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6565; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
6566; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
6567; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
6568; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6569; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
6570; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6571; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm7
6572; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
6573; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
6574; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6575; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6576; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
6577; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6578; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
6579; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6580; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
6581; AVX2-FCP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload
6582; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7]
6583; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6584; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
6585; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
6586; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6587; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6588; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
6589; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6590; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
6591; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm2
6592; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
6593; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
6594; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
6595; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
6596; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm4
6597; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
6598; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6599; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6600; AVX2-FCP-NEXT:    vmovdqa 528(%rdi), %xmm0
6601; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
6602; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
6603; AVX2-FCP-NEXT:    vpbroadcastd 456(%rdi), %xmm4
6604; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %xmm2
6605; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
6606; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
6607; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6608; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6609; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
6610; AVX2-FCP-NEXT:    vpbroadcastd 652(%rdi), %ymm13
6611; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7]
6612; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
6613; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6614; AVX2-FCP-NEXT:    vmovdqa 752(%rdi), %xmm0
6615; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
6616; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
6617; AVX2-FCP-NEXT:    vpbroadcastd 680(%rdi), %xmm13
6618; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %xmm10
6619; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3]
6620; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7]
6621; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm9[0],ymm15[2],ymm9[2]
6622; AVX2-FCP-NEXT:    vpbroadcastd 876(%rdi), %ymm15
6623; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6624; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
6625; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6626; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm0
6627; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6628; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm6 # 32-byte Reload
6629; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm13 = ymm6[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
6630; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
6631; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm15
6632; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm0
6633; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
6634; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
6635; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6636; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6637; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
6638; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm14
6639; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
6640; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
6641; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6642; AVX2-FCP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6643; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
6644; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
6645; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
6646; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
6647; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
6648; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6649; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6650; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,2],ymm9[1,3],ymm15[4,6],ymm9[5,7]
6651; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm11
6652; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
6653; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
6654; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6655; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6656; AVX2-FCP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6657; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
6658; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
6659; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
6660; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
6661; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
6662; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
6663; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm12[1,3],ymm8[4,6],ymm12[5,7]
6664; AVX2-FCP-NEXT:    vbroadcastss 656(%rdi), %ymm3
6665; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6666; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
6667; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6668; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7]
6669; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
6670; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
6671; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
6672; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
6673; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7]
6674; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm8
6675; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm2
6676; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6677; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
6678; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6679; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6680; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6681; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
6682; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3]
6683; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
6684; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
6685; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
6686; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6687; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6688; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm5[1,3],ymm6[4,6],ymm5[5,7]
6689; AVX2-FCP-NEXT:    vbroadcastss 880(%rdi), %ymm2
6690; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6691; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
6692; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6693; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0]
6694; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6695; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6696; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
6697; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
6698; AVX2-FCP-NEXT:    vpbroadcastd 548(%rdi), %xmm2
6699; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %xmm14
6700; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
6701; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
6702; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
6703; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm11, %ymm2
6704; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
6705; AVX2-FCP-NEXT:    vmovaps %ymm12, %ymm13
6706; AVX2-FCP-NEXT:    vpbroadcastd 660(%rdi), %ymm3
6707; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6708; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6709; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6710; AVX2-FCP-NEXT:    vpbroadcastd 100(%rdi), %xmm2
6711; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
6712; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
6713; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6714; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6715; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
6716; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
6717; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6718; AVX2-FCP-NEXT:    vmovaps %ymm15, %ymm10
6719; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm11, %ymm3
6720; AVX2-FCP-NEXT:    vmovaps %ymm9, %ymm7
6721; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
6722; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm4
6723; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
6724; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6725; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6726; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6727; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6728; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
6729; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
6730; AVX2-FCP-NEXT:    vpbroadcastd 324(%rdi), %xmm4
6731; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %xmm2
6732; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
6733; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
6734; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6735; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm11, %ymm4
6736; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
6737; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm15
6738; AVX2-FCP-NEXT:    vpbroadcastd 436(%rdi), %ymm8
6739; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
6740; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
6741; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6742; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6743; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6744; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
6745; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
6746; AVX2-FCP-NEXT:    vpbroadcastd 772(%rdi), %xmm4
6747; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %xmm3
6748; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
6749; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
6750; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm11, %ymm4
6751; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
6752; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm12
6753; AVX2-FCP-NEXT:    vpbroadcastd 884(%rdi), %ymm8
6754; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
6755; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
6756; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6757; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
6758; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7]
6759; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
6760; AVX2-FCP-NEXT:    vbroadcastss 216(%rdi), %ymm6
6761; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
6762; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
6763; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
6764; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
6765; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
6766; AVX2-FCP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
6767; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
6768; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
6769; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6770; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %xmm8
6771; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm14[3]
6772; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
6773; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
6774; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
6775; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
6776; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
6777; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
6778; AVX2-FCP-NEXT:    # ymm5 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
6779; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm5
6780; AVX2-FCP-NEXT:    vbroadcastss 664(%rdi), %ymm7
6781; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
6782; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
6783; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %xmm13
6784; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
6785; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
6786; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
6787; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
6788; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
6789; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
6790; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
6791; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm5
6792; AVX2-FCP-NEXT:    vbroadcastss 440(%rdi), %ymm7
6793; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
6794; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
6795; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %xmm2
6796; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
6797; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2]
6798; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
6799; AVX2-FCP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
6800; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
6801; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
6802; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
6803; AVX2-FCP-NEXT:    # ymm7 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
6804; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
6805; AVX2-FCP-NEXT:    vbroadcastss 888(%rdi), %ymm7
6806; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
6807; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
6808; AVX2-FCP-NEXT:    vbroadcastss 584(%rdi), %xmm3
6809; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6810; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
6811; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6812; AVX2-FCP-NEXT:    vpermd 640(%rdi), %ymm11, %ymm4
6813; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
6814; AVX2-FCP-NEXT:    vpbroadcastd 528(%rdi), %ymm4
6815; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
6816; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
6817; AVX2-FCP-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
6818; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
6819; AVX2-FCP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
6820; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
6821; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
6822; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6823; AVX2-FCP-NEXT:    vbroadcastss 808(%rdi), %xmm3
6824; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6825; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
6826; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6827; AVX2-FCP-NEXT:    vpermd 864(%rdi), %ymm11, %ymm4
6828; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
6829; AVX2-FCP-NEXT:    vpbroadcastd 752(%rdi), %ymm4
6830; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
6831; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
6832; AVX2-FCP-NEXT:    # ymm4 = mem[2,3,2,3,6,7,6,7]
6833; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6834; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
6835; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
6836; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
6837; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6838; AVX2-FCP-NEXT:    vbroadcastss 136(%rdi), %xmm3
6839; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6840; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
6841; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6842; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm11, %ymm4
6843; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
6844; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm4
6845; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
6846; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6847; AVX2-FCP-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
6848; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
6849; AVX2-FCP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
6850; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
6851; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
6852; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6853; AVX2-FCP-NEXT:    vbroadcastss 360(%rdi), %xmm4
6854; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
6855; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
6856; AVX2-FCP-NEXT:    vpermd 416(%rdi), %ymm11, %ymm6
6857; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
6858; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
6859; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm6
6860; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3]
6861; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
6862; AVX2-FCP-NEXT:    # ymm10 = mem[2,3,2,3,6,7,6,7]
6863; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
6864; AVX2-FCP-NEXT:    # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
6865; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
6866; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
6867; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
6868; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6869; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rsi)
6870; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6871; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rsi)
6872; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6873; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rsi)
6874; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6875; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
6876; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6877; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rdx)
6878; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6879; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rdx)
6880; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6881; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rdx)
6882; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6883; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rdx)
6884; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6885; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rcx)
6886; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6887; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rcx)
6888; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6889; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rcx)
6890; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6891; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rcx)
6892; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
6893; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%r8)
6894; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6895; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%r8)
6896; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6897; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%r8)
6898; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6899; AVX2-FCP-NEXT:    vmovaps %ymm6, (%r8)
6900; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6901; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%r9)
6902; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6903; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%r9)
6904; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6905; AVX2-FCP-NEXT:    vmovaps %ymm6, (%r9)
6906; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6907; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%r9)
6908; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6909; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rax)
6910; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rax)
6911; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rax)
6912; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
6913; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6914; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rax)
6915; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rax)
6916; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rax)
6917; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rax)
6918; AVX2-FCP-NEXT:    addq $1224, %rsp # imm = 0x4C8
6919; AVX2-FCP-NEXT:    vzeroupper
6920; AVX2-FCP-NEXT:    retq
6921;
6922; AVX512-LABEL: load_i32_stride7_vf32:
6923; AVX512:       # %bb.0:
6924; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6925; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
6926; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm1
6927; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm0
6928; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm4
6929; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm2
6930; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm5
6931; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm6
6932; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm3
6933; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm13
6934; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm15
6935; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm9
6936; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm10
6937; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm11
6938; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm14
6939; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm12
6940; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
6941; AVX512-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
6942; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm17
6943; AVX512-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
6944; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
6945; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm8
6946; AVX512-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
6947; AVX512-NEXT:    movw $992, %di # imm = 0x3E0
6948; AVX512-NEXT:    kmovw %edi, %k1
6949; AVX512-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
6950; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
6951; AVX512-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6952; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm18
6953; AVX512-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
6954; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
6955; AVX512-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
6956; AVX512-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
6957; AVX512-NEXT:    movb $-32, %dil
6958; AVX512-NEXT:    kmovw %edi, %k2
6959; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
6960; AVX512-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
6961; AVX512-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
6962; AVX512-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
6963; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
6964; AVX512-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
6965; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
6966; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
6967; AVX512-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6968; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm19
6969; AVX512-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
6970; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
6971; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm17
6972; AVX512-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
6973; AVX512-NEXT:    movw $480, %di # imm = 0x1E0
6974; AVX512-NEXT:    kmovw %edi, %k2
6975; AVX512-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
6976; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
6977; AVX512-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
6978; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm20
6979; AVX512-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
6980; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
6981; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
6982; AVX512-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
6983; AVX512-NEXT:    movw $-512, %di # imm = 0xFE00
6984; AVX512-NEXT:    kmovw %edi, %k1
6985; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
6986; AVX512-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
6987; AVX512-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
6988; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm20
6989; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
6990; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
6991; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
6992; AVX512-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
6993; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
6994; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6995; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm22
6996; AVX512-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
6997; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
6998; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm20
6999; AVX512-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
7000; AVX512-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
7001; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
7002; AVX512-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
7003; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm23
7004; AVX512-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
7005; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
7006; AVX512-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
7007; AVX512-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
7008; AVX512-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
7009; AVX512-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
7010; AVX512-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
7011; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm23
7012; AVX512-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
7013; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
7014; AVX512-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
7015; AVX512-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
7016; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
7017; AVX512-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7018; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm25
7019; AVX512-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
7020; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
7021; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm23
7022; AVX512-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
7023; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
7024; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
7025; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
7026; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm26
7027; AVX512-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
7028; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
7029; AVX512-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7030; AVX512-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7031; AVX512-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
7032; AVX512-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
7033; AVX512-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
7034; AVX512-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
7035; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
7036; AVX512-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
7037; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
7038; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
7039; AVX512-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
7040; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm25
7041; AVX512-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
7042; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
7043; AVX512-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7044; AVX512-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
7045; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
7046; AVX512-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7047; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm24
7048; AVX512-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
7049; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
7050; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm30
7051; AVX512-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
7052; AVX512-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
7053; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
7054; AVX512-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
7055; AVX512-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
7056; AVX512-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7057; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
7058; AVX512-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
7059; AVX512-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
7060; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm26
7061; AVX512-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
7062; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
7063; AVX512-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7064; AVX512-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7065; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
7066; AVX512-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7067; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm29
7068; AVX512-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
7069; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
7070; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm31
7071; AVX512-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
7072; AVX512-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
7073; AVX512-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
7074; AVX512-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
7075; AVX512-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
7076; AVX512-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7077; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
7078; AVX512-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
7079; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
7080; AVX512-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
7081; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
7082; AVX512-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
7083; AVX512-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
7084; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
7085; AVX512-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
7086; AVX512-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
7087; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
7088; AVX512-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
7089; AVX512-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
7090; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
7091; AVX512-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
7092; AVX512-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
7093; AVX512-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
7094; AVX512-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
7095; AVX512-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
7096; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
7097; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
7098; AVX512-NEXT:    vmovdqa64 %zmm8, (%rsi)
7099; AVX512-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
7100; AVX512-NEXT:    vmovdqa64 %zmm17, (%rdx)
7101; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
7102; AVX512-NEXT:    vmovdqa64 %zmm20, (%rcx)
7103; AVX512-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7104; AVX512-NEXT:    vmovdqa64 %zmm23, (%r8)
7105; AVX512-NEXT:    vmovdqa64 %zmm25, 64(%r9)
7106; AVX512-NEXT:    vmovdqa64 %zmm24, (%r9)
7107; AVX512-NEXT:    vmovdqa64 %zmm26, 64(%r10)
7108; AVX512-NEXT:    vmovdqa64 %zmm29, (%r10)
7109; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
7110; AVX512-NEXT:    vmovdqa64 %zmm10, (%rax)
7111; AVX512-NEXT:    vzeroupper
7112; AVX512-NEXT:    retq
7113;
7114; AVX512-FCP-LABEL: load_i32_stride7_vf32:
7115; AVX512-FCP:       # %bb.0:
7116; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7117; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7118; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm1
7119; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
7120; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
7121; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
7122; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
7123; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
7124; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm3
7125; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
7126; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
7127; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
7128; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
7129; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
7130; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm14
7131; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm12
7132; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
7133; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
7134; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm17
7135; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
7136; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
7137; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm8
7138; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
7139; AVX512-FCP-NEXT:    movw $992, %di # imm = 0x3E0
7140; AVX512-FCP-NEXT:    kmovw %edi, %k1
7141; AVX512-FCP-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
7142; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
7143; AVX512-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7144; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
7145; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
7146; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
7147; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7148; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
7149; AVX512-FCP-NEXT:    movb $-32, %dil
7150; AVX512-FCP-NEXT:    kmovw %edi, %k2
7151; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
7152; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
7153; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
7154; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
7155; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
7156; AVX512-FCP-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
7157; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
7158; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
7159; AVX512-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7160; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm19
7161; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
7162; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
7163; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
7164; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
7165; AVX512-FCP-NEXT:    movw $480, %di # imm = 0x1E0
7166; AVX512-FCP-NEXT:    kmovw %edi, %k2
7167; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
7168; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
7169; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7170; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
7171; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
7172; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
7173; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
7174; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
7175; AVX512-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
7176; AVX512-FCP-NEXT:    kmovw %edi, %k1
7177; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
7178; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
7179; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
7180; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm20
7181; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
7182; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
7183; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
7184; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
7185; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
7186; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7187; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
7188; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
7189; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
7190; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm20
7191; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
7192; AVX512-FCP-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
7193; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
7194; AVX512-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
7195; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm23
7196; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
7197; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
7198; AVX512-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
7199; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
7200; AVX512-FCP-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
7201; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
7202; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
7203; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
7204; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
7205; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
7206; AVX512-FCP-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
7207; AVX512-FCP-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
7208; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
7209; AVX512-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7210; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm25
7211; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
7212; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
7213; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm23
7214; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
7215; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
7216; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
7217; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
7218; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm26
7219; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
7220; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
7221; AVX512-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7222; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7223; AVX512-FCP-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
7224; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
7225; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
7226; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
7227; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
7228; AVX512-FCP-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
7229; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
7230; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
7231; AVX512-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
7232; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm25
7233; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
7234; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
7235; AVX512-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7236; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
7237; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
7238; AVX512-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7239; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24
7240; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
7241; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
7242; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm30
7243; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
7244; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
7245; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
7246; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
7247; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
7248; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7249; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
7250; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
7251; AVX512-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
7252; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
7253; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
7254; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
7255; AVX512-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7256; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7257; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
7258; AVX512-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7259; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm29
7260; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
7261; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
7262; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
7263; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
7264; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
7265; AVX512-FCP-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
7266; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
7267; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
7268; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7269; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
7270; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
7271; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
7272; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
7273; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
7274; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
7275; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
7276; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
7277; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
7278; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
7279; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
7280; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
7281; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
7282; AVX512-FCP-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
7283; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
7284; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
7285; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
7286; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
7287; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
7288; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
7289; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
7290; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
7291; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
7292; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
7293; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
7294; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
7295; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7296; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
7297; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r9)
7298; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
7299; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r10)
7300; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, (%r10)
7301; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
7302; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, (%rax)
7303; AVX512-FCP-NEXT:    vzeroupper
7304; AVX512-FCP-NEXT:    retq
7305;
7306; AVX512DQ-LABEL: load_i32_stride7_vf32:
7307; AVX512DQ:       # %bb.0:
7308; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7309; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7310; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm1
7311; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm0
7312; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm4
7313; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm2
7314; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm5
7315; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm6
7316; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm3
7317; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm13
7318; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm15
7319; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm9
7320; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm10
7321; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm11
7322; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm14
7323; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm12
7324; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
7325; AVX512DQ-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
7326; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm17
7327; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
7328; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
7329; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm8
7330; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
7331; AVX512DQ-NEXT:    movw $992, %di # imm = 0x3E0
7332; AVX512DQ-NEXT:    kmovw %edi, %k1
7333; AVX512DQ-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
7334; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
7335; AVX512DQ-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7336; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm18
7337; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
7338; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
7339; AVX512DQ-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7340; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
7341; AVX512DQ-NEXT:    movb $-32, %dil
7342; AVX512DQ-NEXT:    kmovw %edi, %k2
7343; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
7344; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
7345; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
7346; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
7347; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
7348; AVX512DQ-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
7349; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
7350; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
7351; AVX512DQ-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7352; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm19
7353; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
7354; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
7355; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm17
7356; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
7357; AVX512DQ-NEXT:    movw $480, %di # imm = 0x1E0
7358; AVX512DQ-NEXT:    kmovw %edi, %k2
7359; AVX512DQ-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
7360; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
7361; AVX512DQ-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7362; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm20
7363; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
7364; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
7365; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
7366; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
7367; AVX512DQ-NEXT:    movw $-512, %di # imm = 0xFE00
7368; AVX512DQ-NEXT:    kmovw %edi, %k1
7369; AVX512DQ-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
7370; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
7371; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
7372; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm20
7373; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
7374; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
7375; AVX512DQ-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
7376; AVX512DQ-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
7377; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
7378; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7379; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm22
7380; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
7381; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
7382; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm20
7383; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
7384; AVX512DQ-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
7385; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
7386; AVX512DQ-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
7387; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm23
7388; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
7389; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
7390; AVX512DQ-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
7391; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
7392; AVX512DQ-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
7393; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
7394; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
7395; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm23
7396; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
7397; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
7398; AVX512DQ-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
7399; AVX512DQ-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
7400; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
7401; AVX512DQ-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7402; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm25
7403; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
7404; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
7405; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm23
7406; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
7407; AVX512DQ-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
7408; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
7409; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
7410; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm26
7411; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
7412; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
7413; AVX512DQ-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7414; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7415; AVX512DQ-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
7416; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
7417; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
7418; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
7419; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
7420; AVX512DQ-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
7421; AVX512DQ-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
7422; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
7423; AVX512DQ-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
7424; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm25
7425; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
7426; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
7427; AVX512DQ-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7428; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
7429; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
7430; AVX512DQ-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7431; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm24
7432; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
7433; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
7434; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm30
7435; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
7436; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
7437; AVX512DQ-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
7438; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
7439; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
7440; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7441; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
7442; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
7443; AVX512DQ-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
7444; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm26
7445; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
7446; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
7447; AVX512DQ-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7448; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7449; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
7450; AVX512DQ-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7451; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm29
7452; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
7453; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
7454; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm31
7455; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
7456; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
7457; AVX512DQ-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
7458; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
7459; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
7460; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7461; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
7462; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
7463; AVX512DQ-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
7464; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
7465; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
7466; AVX512DQ-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
7467; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
7468; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
7469; AVX512DQ-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
7470; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
7471; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
7472; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
7473; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
7474; AVX512DQ-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
7475; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
7476; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
7477; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
7478; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
7479; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
7480; AVX512DQ-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
7481; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
7482; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rsi)
7483; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
7484; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rdx)
7485; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
7486; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rcx)
7487; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7488; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%r8)
7489; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 64(%r9)
7490; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%r9)
7491; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 64(%r10)
7492; AVX512DQ-NEXT:    vmovdqa64 %zmm29, (%r10)
7493; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
7494; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%rax)
7495; AVX512DQ-NEXT:    vzeroupper
7496; AVX512DQ-NEXT:    retq
7497;
7498; AVX512DQ-FCP-LABEL: load_i32_stride7_vf32:
7499; AVX512DQ-FCP:       # %bb.0:
7500; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7501; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7502; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm1
7503; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
7504; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
7505; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
7506; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
7507; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
7508; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm3
7509; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
7510; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
7511; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
7512; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
7513; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
7514; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm14
7515; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm12
7516; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
7517; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
7518; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm17
7519; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
7520; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
7521; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm8
7522; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
7523; AVX512DQ-FCP-NEXT:    movw $992, %di # imm = 0x3E0
7524; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
7525; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
7526; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
7527; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7528; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
7529; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
7530; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
7531; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7532; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
7533; AVX512DQ-FCP-NEXT:    movb $-32, %dil
7534; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
7535; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
7536; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
7537; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
7538; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
7539; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
7540; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
7541; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
7542; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
7543; AVX512DQ-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7544; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm19
7545; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
7546; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
7547; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
7548; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
7549; AVX512DQ-FCP-NEXT:    movw $480, %di # imm = 0x1E0
7550; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
7551; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
7552; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
7553; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7554; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
7555; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
7556; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
7557; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
7558; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
7559; AVX512DQ-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
7560; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
7561; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
7562; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
7563; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
7564; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm20
7565; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
7566; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
7567; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
7568; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
7569; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
7570; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7571; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
7572; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
7573; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
7574; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm20
7575; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
7576; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
7577; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
7578; AVX512DQ-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
7579; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm23
7580; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
7581; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
7582; AVX512DQ-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
7583; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
7584; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
7585; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
7586; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
7587; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
7588; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
7589; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
7590; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
7591; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
7592; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
7593; AVX512DQ-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7594; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm25
7595; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
7596; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
7597; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm23
7598; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
7599; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
7600; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
7601; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
7602; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm26
7603; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
7604; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
7605; AVX512DQ-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7606; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7607; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
7608; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
7609; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
7610; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
7611; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
7612; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
7613; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
7614; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
7615; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
7616; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm25
7617; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
7618; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
7619; AVX512DQ-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7620; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
7621; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
7622; AVX512DQ-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7623; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24
7624; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
7625; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
7626; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm30
7627; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
7628; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
7629; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
7630; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
7631; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
7632; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7633; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
7634; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
7635; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
7636; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
7637; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
7638; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
7639; AVX512DQ-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7640; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7641; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
7642; AVX512DQ-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7643; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm29
7644; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
7645; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
7646; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
7647; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
7648; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
7649; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
7650; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
7651; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
7652; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7653; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
7654; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
7655; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
7656; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
7657; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
7658; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
7659; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
7660; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
7661; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
7662; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
7663; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
7664; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
7665; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
7666; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
7667; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
7668; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
7669; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
7670; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
7671; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
7672; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
7673; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
7674; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
7675; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
7676; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
7677; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
7678; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
7679; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7680; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
7681; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r9)
7682; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
7683; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r10)
7684; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, (%r10)
7685; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
7686; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rax)
7687; AVX512DQ-FCP-NEXT:    vzeroupper
7688; AVX512DQ-FCP-NEXT:    retq
7689;
7690; AVX512BW-LABEL: load_i32_stride7_vf32:
7691; AVX512BW:       # %bb.0:
7692; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7693; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7694; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm1
7695; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
7696; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
7697; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
7698; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm5
7699; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm6
7700; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm3
7701; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm13
7702; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
7703; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
7704; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm10
7705; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
7706; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm14
7707; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm12
7708; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
7709; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
7710; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm17
7711; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
7712; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
7713; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
7714; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
7715; AVX512BW-NEXT:    movw $992, %di # imm = 0x3E0
7716; AVX512BW-NEXT:    kmovd %edi, %k1
7717; AVX512BW-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
7718; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
7719; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7720; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18
7721; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
7722; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
7723; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7724; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
7725; AVX512BW-NEXT:    movb $-32, %dil
7726; AVX512BW-NEXT:    kmovd %edi, %k2
7727; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
7728; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
7729; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
7730; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
7731; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
7732; AVX512BW-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
7733; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
7734; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
7735; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7736; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm19
7737; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
7738; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
7739; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm17
7740; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
7741; AVX512BW-NEXT:    movw $480, %di # imm = 0x1E0
7742; AVX512BW-NEXT:    kmovd %edi, %k2
7743; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
7744; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
7745; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7746; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm20
7747; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
7748; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
7749; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
7750; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
7751; AVX512BW-NEXT:    movw $-512, %di # imm = 0xFE00
7752; AVX512BW-NEXT:    kmovd %edi, %k1
7753; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
7754; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
7755; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
7756; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm20
7757; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
7758; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
7759; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
7760; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
7761; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
7762; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7763; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm22
7764; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
7765; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
7766; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm20
7767; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
7768; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
7769; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
7770; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
7771; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm23
7772; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
7773; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
7774; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
7775; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
7776; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
7777; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
7778; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
7779; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm23
7780; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
7781; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
7782; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
7783; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
7784; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
7785; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7786; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm25
7787; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
7788; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
7789; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm23
7790; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
7791; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
7792; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
7793; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
7794; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm26
7795; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
7796; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
7797; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7798; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7799; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
7800; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
7801; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
7802; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
7803; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
7804; AVX512BW-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
7805; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
7806; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
7807; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
7808; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm25
7809; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
7810; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
7811; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7812; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
7813; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
7814; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7815; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm24
7816; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
7817; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
7818; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm30
7819; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
7820; AVX512BW-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
7821; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
7822; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
7823; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
7824; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7825; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
7826; AVX512BW-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
7827; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
7828; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm26
7829; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
7830; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
7831; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7832; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7833; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
7834; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
7835; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm29
7836; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
7837; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
7838; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm31
7839; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
7840; AVX512BW-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
7841; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
7842; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
7843; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
7844; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
7845; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
7846; AVX512BW-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
7847; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
7848; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
7849; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
7850; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
7851; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
7852; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
7853; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
7854; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
7855; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
7856; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
7857; AVX512BW-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
7858; AVX512BW-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
7859; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
7860; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
7861; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
7862; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
7863; AVX512BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
7864; AVX512BW-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
7865; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
7866; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
7867; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
7868; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rdx)
7869; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
7870; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%rcx)
7871; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
7872; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r8)
7873; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%r9)
7874; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%r9)
7875; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%r10)
7876; AVX512BW-NEXT:    vmovdqa64 %zmm29, (%r10)
7877; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
7878; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rax)
7879; AVX512BW-NEXT:    vzeroupper
7880; AVX512BW-NEXT:    retq
7881;
7882; AVX512BW-FCP-LABEL: load_i32_stride7_vf32:
7883; AVX512BW-FCP:       # %bb.0:
7884; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7885; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7886; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm1
7887; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
7888; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
7889; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
7890; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
7891; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
7892; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm3
7893; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
7894; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
7895; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
7896; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
7897; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
7898; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm14
7899; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm12
7900; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
7901; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
7902; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm17
7903; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
7904; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
7905; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm8
7906; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
7907; AVX512BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
7908; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
7909; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
7910; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
7911; AVX512BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7912; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
7913; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
7914; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
7915; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7916; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
7917; AVX512BW-FCP-NEXT:    movb $-32, %dil
7918; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
7919; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
7920; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
7921; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
7922; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
7923; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
7924; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
7925; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
7926; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
7927; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7928; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm19
7929; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
7930; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
7931; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
7932; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
7933; AVX512BW-FCP-NEXT:    movw $480, %di # imm = 0x1E0
7934; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
7935; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
7936; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
7937; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
7938; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
7939; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
7940; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
7941; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
7942; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
7943; AVX512BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
7944; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
7945; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
7946; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
7947; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
7948; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm20
7949; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
7950; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
7951; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
7952; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
7953; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
7954; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7955; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
7956; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
7957; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
7958; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm20
7959; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
7960; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
7961; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
7962; AVX512BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
7963; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm23
7964; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
7965; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
7966; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
7967; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
7968; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
7969; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
7970; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
7971; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
7972; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
7973; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
7974; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
7975; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
7976; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
7977; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7978; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm25
7979; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
7980; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
7981; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm23
7982; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
7983; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
7984; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
7985; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
7986; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm26
7987; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
7988; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
7989; AVX512BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
7990; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
7991; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
7992; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
7993; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
7994; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
7995; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
7996; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
7997; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
7998; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
7999; AVX512BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
8000; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm25
8001; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
8002; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
8003; AVX512BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8004; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
8005; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
8006; AVX512BW-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
8007; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24
8008; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
8009; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
8010; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm30
8011; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
8012; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
8013; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
8014; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
8015; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
8016; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
8017; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
8018; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
8019; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
8020; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
8021; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
8022; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
8023; AVX512BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8024; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
8025; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
8026; AVX512BW-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
8027; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm29
8028; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
8029; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
8030; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
8031; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
8032; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
8033; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
8034; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
8035; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
8036; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
8037; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
8038; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
8039; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
8040; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
8041; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
8042; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
8043; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
8044; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
8045; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
8046; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
8047; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
8048; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
8049; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
8050; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
8051; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
8052; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
8053; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
8054; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
8055; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
8056; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
8057; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
8058; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
8059; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
8060; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
8061; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
8062; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
8063; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8064; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
8065; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r9)
8066; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
8067; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r10)
8068; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, (%r10)
8069; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
8070; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rax)
8071; AVX512BW-FCP-NEXT:    vzeroupper
8072; AVX512BW-FCP-NEXT:    retq
8073;
8074; AVX512DQ-BW-LABEL: load_i32_stride7_vf32:
8075; AVX512DQ-BW:       # %bb.0:
8076; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8077; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8078; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm1
8079; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
8080; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
8081; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
8082; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm5
8083; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm6
8084; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm3
8085; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm13
8086; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
8087; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
8088; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm10
8089; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
8090; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm14
8091; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm12
8092; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
8093; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
8094; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm17
8095; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
8096; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
8097; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm8
8098; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
8099; AVX512DQ-BW-NEXT:    movw $992, %di # imm = 0x3E0
8100; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
8101; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
8102; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
8103; AVX512DQ-BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8104; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm18
8105; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
8106; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
8107; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
8108; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
8109; AVX512DQ-BW-NEXT:    movb $-32, %dil
8110; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
8111; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
8112; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
8113; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
8114; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
8115; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
8116; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
8117; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
8118; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
8119; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8120; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm19
8121; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
8122; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
8123; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm17
8124; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
8125; AVX512DQ-BW-NEXT:    movw $480, %di # imm = 0x1E0
8126; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
8127; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
8128; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
8129; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
8130; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm20
8131; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
8132; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
8133; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
8134; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
8135; AVX512DQ-BW-NEXT:    movw $-512, %di # imm = 0xFE00
8136; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
8137; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
8138; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
8139; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
8140; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm20
8141; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
8142; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
8143; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
8144; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
8145; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
8146; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8147; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm22
8148; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
8149; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
8150; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm20
8151; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
8152; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
8153; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
8154; AVX512DQ-BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
8155; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm23
8156; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
8157; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
8158; AVX512DQ-BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
8159; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
8160; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
8161; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
8162; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
8163; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm23
8164; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
8165; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
8166; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
8167; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
8168; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
8169; AVX512DQ-BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8170; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm25
8171; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
8172; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
8173; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm23
8174; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
8175; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
8176; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
8177; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
8178; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm26
8179; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
8180; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
8181; AVX512DQ-BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8182; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
8183; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
8184; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
8185; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
8186; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
8187; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
8188; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
8189; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
8190; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
8191; AVX512DQ-BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
8192; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm25
8193; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
8194; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
8195; AVX512DQ-BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8196; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
8197; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
8198; AVX512DQ-BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
8199; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm24
8200; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
8201; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
8202; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm30
8203; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
8204; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
8205; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
8206; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
8207; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
8208; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
8209; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
8210; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
8211; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
8212; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm26
8213; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
8214; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
8215; AVX512DQ-BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8216; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
8217; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
8218; AVX512DQ-BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
8219; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm29
8220; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
8221; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
8222; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm31
8223; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
8224; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
8225; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
8226; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
8227; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
8228; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
8229; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
8230; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
8231; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
8232; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
8233; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
8234; AVX512DQ-BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
8235; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
8236; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
8237; AVX512DQ-BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
8238; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
8239; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
8240; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
8241; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
8242; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
8243; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
8244; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
8245; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
8246; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
8247; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
8248; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
8249; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
8250; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
8251; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
8252; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, (%rdx)
8253; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
8254; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, (%rcx)
8255; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8256; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, (%r8)
8257; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, 64(%r9)
8258; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, (%r9)
8259; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, 64(%r10)
8260; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, (%r10)
8261; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
8262; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%rax)
8263; AVX512DQ-BW-NEXT:    vzeroupper
8264; AVX512DQ-BW-NEXT:    retq
8265;
8266; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf32:
8267; AVX512DQ-BW-FCP:       # %bb.0:
8268; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8269; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
8270; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm1
8271; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
8272; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
8273; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm2
8274; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
8275; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
8276; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm3
8277; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
8278; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
8279; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
8280; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
8281; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
8282; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm14
8283; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm12
8284; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
8285; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
8286; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm17
8287; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm16, %zmm17
8288; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0]
8289; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm8
8290; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm8
8291; AVX512DQ-BW-FCP-NEXT:    movw $992, %di # imm = 0x3E0
8292; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
8293; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm8 {%k1}
8294; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
8295; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8296; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm18
8297; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm17, %zmm18
8298; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
8299; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
8300; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
8301; AVX512DQ-BW-FCP-NEXT:    movb $-32, %dil
8302; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
8303; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm8 {%k2}
8304; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm17
8305; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm19, %zmm17
8306; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm16
8307; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
8308; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm7 {%k1}
8309; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm7 {%k2}
8310; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
8311; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8312; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm19
8313; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm19
8314; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0]
8315; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
8316; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm16, %zmm17
8317; AVX512DQ-BW-FCP-NEXT:    movw $480, %di # imm = 0x1E0
8318; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
8319; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm17 {%k2}
8320; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
8321; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
8322; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
8323; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm20
8324; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
8325; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
8326; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm21, %zmm20
8327; AVX512DQ-BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
8328; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
8329; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm17 {%k1}
8330; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm19
8331; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm19
8332; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm20
8333; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm20
8334; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm16
8335; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm16 {%k2}
8336; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm16 {%k1}
8337; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
8338; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8339; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
8340; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm21, %zmm22
8341; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0]
8342; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm20
8343; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm19, %zmm20
8344; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm20 {%k2}
8345; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
8346; AVX512DQ-BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
8347; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm23
8348; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm22, %zmm23
8349; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
8350; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
8351; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm23
8352; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm20 {%k1}
8353; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm22
8354; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm24, %zmm22
8355; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
8356; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm21, %zmm23
8357; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
8358; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm19 {%k2}
8359; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm19 {%k1}
8360; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
8361; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8362; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm25
8363; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm24, %zmm25
8364; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0]
8365; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm23
8366; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
8367; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm23 {%k2}
8368; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
8369; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
8370; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm26
8371; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm26
8372; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
8373; AVX512DQ-BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8374; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
8375; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm23 {%k1}
8376; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm25
8377; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm25
8378; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm24
8379; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm22
8380; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm22 {%k2}
8381; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm22 {%k1}
8382; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
8383; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
8384; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm25
8385; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm26, %zmm25
8386; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
8387; AVX512DQ-BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8388; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm25
8389; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
8390; AVX512DQ-BW-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
8391; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24
8392; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm24
8393; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25]
8394; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm30
8395; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm29, %zmm30
8396; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm24, %zmm24
8397; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm24 {%k1}
8398; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm6, %zmm26
8399; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm26
8400; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
8401; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm29
8402; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm29, %zmm28, %zmm25
8403; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
8404; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm26
8405; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm26
8406; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
8407; AVX512DQ-BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
8408; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm27, %zmm26
8409; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
8410; AVX512DQ-BW-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
8411; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm29
8412; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm28, %zmm29
8413; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26]
8414; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm31
8415; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm30, %zmm31
8416; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm31, %zmm29, %zmm29
8417; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm29 {%k1}
8418; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm3, %zmm18
8419; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm27, %zmm18
8420; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm4, %zmm28
8421; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm30
8422; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm28, %zmm26
8423; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm26 {%k1}
8424; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm21, %zmm9
8425; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
8426; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
8427; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm15, %zmm9
8428; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
8429; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
8430; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm13, %zmm12
8431; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27]
8432; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm10
8433; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm12, %zmm10
8434; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm10 {%k1}
8435; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm21, %zmm3
8436; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
8437; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm13, %zmm2
8438; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm0
8439; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm2, %zmm0
8440; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
8441; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
8442; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
8443; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
8444; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
8445; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
8446; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
8447; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r8)
8448; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
8449; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r9)
8450; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
8451; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r10)
8452; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, (%r10)
8453; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
8454; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rax)
8455; AVX512DQ-BW-FCP-NEXT:    vzeroupper
8456; AVX512DQ-BW-FCP-NEXT:    retq
8457  %wide.vec = load <224 x i32>, ptr %in.vec, align 64
8458  %strided.vec0 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
8459  %strided.vec1 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
8460  %strided.vec2 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
8461  %strided.vec3 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
8462  %strided.vec4 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
8463  %strided.vec5 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
8464  %strided.vec6 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
8465  store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
8466  store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
8467  store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
8468  store <32 x i32> %strided.vec3, ptr %out.vec3, align 64
8469  store <32 x i32> %strided.vec4, ptr %out.vec4, align 64
8470  store <32 x i32> %strided.vec5, ptr %out.vec5, align 64
8471  store <32 x i32> %strided.vec6, ptr %out.vec6, align 64
8472  ret void
8473}
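; Note on the pattern being tested: the shufflevector masks above pull elements
; j, j+7, j+14, ... out of one wide load, i.e. a stride-7 de-interleave of
; seven interleaved i32 streams.  A minimal C sketch of the scalar loop shape
; the LoopVectorizer vectorizes into this wide-load-plus-shuffle form is below;
; the names (deinterleave7, in, out0..out6, n) are hypothetical and only
; illustrate the access pattern, they are not part of this test.
;
;   void deinterleave7(const int *in, int *out0, int *out1, int *out2,
;                      int *out3, int *out4, int *out5, int *out6, int n) {
;     for (int i = 0; i < n; ++i) {  // each iteration reads 7 consecutive i32s
;       out0[i] = in[7 * i + 0];
;       out1[i] = in[7 * i + 1];
;       out2[i] = in[7 * i + 2];
;       out3[i] = in[7 * i + 3];
;       out4[i] = in[7 * i + 4];
;       out5[i] = in[7 * i + 5];
;       out6[i] = in[7 * i + 6];
;     }
;   }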
8474
8475define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
8476; SSE-LABEL: load_i32_stride7_vf64:
8477; SSE:       # %bb.0:
8478; SSE-NEXT:    subq $2456, %rsp # imm = 0x998
8479; SSE-NEXT:    movdqa 1088(%rdi), %xmm3
8480; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8481; SSE-NEXT:    movdqa 1056(%rdi), %xmm4
8482; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8483; SSE-NEXT:    movdqa 1008(%rdi), %xmm9
8484; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8485; SSE-NEXT:    movdqa 1024(%rdi), %xmm5
8486; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8487; SSE-NEXT:    movdqa 640(%rdi), %xmm13
8488; SSE-NEXT:    movdqa 608(%rdi), %xmm6
8489; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8490; SSE-NEXT:    movdqa 560(%rdi), %xmm10
8491; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8492; SSE-NEXT:    movdqa 576(%rdi), %xmm7
8493; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8494; SSE-NEXT:    movdqa 192(%rdi), %xmm2
8495; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8496; SSE-NEXT:    movdqa 160(%rdi), %xmm15
8497; SSE-NEXT:    movdqa 112(%rdi), %xmm1
8498; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8499; SSE-NEXT:    movdqa 128(%rdi), %xmm0
8500; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8501; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8502; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8503; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
8504; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8505; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8506; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8507; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8508; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
8509; SSE-NEXT:    movdqa %xmm10, %xmm1
8510; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8511; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
8512; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
8513; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8514; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8515; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8516; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
8517; SSE-NEXT:    movdqa %xmm9, %xmm1
8518; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8519; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
8520; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
8521; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8522; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8523; SSE-NEXT:    movdqa 1456(%rdi), %xmm1
8524; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8525; SSE-NEXT:    movdqa 1472(%rdi), %xmm0
8526; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8527; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8528; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8529; SSE-NEXT:    movdqa 1536(%rdi), %xmm2
8530; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8531; SSE-NEXT:    movdqa 1504(%rdi), %xmm0
8532; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8533; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8534; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8535; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8536; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8537; SSE-NEXT:    movdqa (%rdi), %xmm1
8538; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8539; SSE-NEXT:    movdqa 16(%rdi), %xmm0
8540; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8541; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8542; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8543; SSE-NEXT:    movdqa 80(%rdi), %xmm2
8544; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8545; SSE-NEXT:    movdqa 48(%rdi), %xmm0
8546; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8547; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8548; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8549; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8550; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8551; SSE-NEXT:    movdqa 448(%rdi), %xmm1
8552; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8553; SSE-NEXT:    movdqa 464(%rdi), %xmm0
8554; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8555; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8556; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8557; SSE-NEXT:    movdqa 528(%rdi), %xmm2
8558; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8559; SSE-NEXT:    movdqa 496(%rdi), %xmm0
8560; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8561; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8562; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8563; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8564; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8565; SSE-NEXT:    movdqa 896(%rdi), %xmm1
8566; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8567; SSE-NEXT:    movdqa 912(%rdi), %xmm0
8568; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8569; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8570; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8571; SSE-NEXT:    movdqa 976(%rdi), %xmm2
8572; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8573; SSE-NEXT:    movdqa 944(%rdi), %xmm0
8574; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8575; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8576; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8577; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8578; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8579; SSE-NEXT:    movdqa 1344(%rdi), %xmm1
8580; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8581; SSE-NEXT:    movdqa 1360(%rdi), %xmm0
8582; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8583; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8584; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8585; SSE-NEXT:    movdqa 1424(%rdi), %xmm2
8586; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8587; SSE-NEXT:    movdqa 1392(%rdi), %xmm0
8588; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8589; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8590; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8591; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8592; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8593; SSE-NEXT:    movdqa 336(%rdi), %xmm12
8594; SSE-NEXT:    movdqa 352(%rdi), %xmm0
8595; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8596; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
8597; SSE-NEXT:    movdqa %xmm12, %xmm5
8598; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8599; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
8600; SSE-NEXT:    movdqa 416(%rdi), %xmm4
8601; SSE-NEXT:    movdqa 384(%rdi), %xmm10
8602; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
8603; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8604; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
8605; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8606; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
8607; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8608; SSE-NEXT:    movdqa 784(%rdi), %xmm6
8609; SSE-NEXT:    movdqa %xmm6, (%rsp) # 16-byte Spill
8610; SSE-NEXT:    movdqa 800(%rdi), %xmm0
8611; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8612; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
8613; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
8614; SSE-NEXT:    movdqa 864(%rdi), %xmm0
8615; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8616; SSE-NEXT:    movdqa 832(%rdi), %xmm1
8617; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8618; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3]
8619; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
8620; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
8621; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8622; SSE-NEXT:    movdqa 1232(%rdi), %xmm6
8623; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8624; SSE-NEXT:    movdqa 1248(%rdi), %xmm0
8625; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8626; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
8627; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
8628; SSE-NEXT:    movdqa 1312(%rdi), %xmm0
8629; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8630; SSE-NEXT:    movdqa 1280(%rdi), %xmm1
8631; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8632; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3]
8633; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
8634; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
8635; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8636; SSE-NEXT:    movdqa 1680(%rdi), %xmm6
8637; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8638; SSE-NEXT:    movdqa 1696(%rdi), %xmm0
8639; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8640; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
8641; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
8642; SSE-NEXT:    movdqa 1760(%rdi), %xmm1
8643; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8644; SSE-NEXT:    movdqa 1728(%rdi), %xmm0
8645; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8646; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
8647; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
8648; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
8649; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8650; SSE-NEXT:    movdqa 224(%rdi), %xmm8
8651; SSE-NEXT:    movdqa 240(%rdi), %xmm0
8652; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8653; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
8654; SSE-NEXT:    movdqa %xmm8, %xmm6
8655; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8656; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
8657; SSE-NEXT:    movdqa 304(%rdi), %xmm2
8658; SSE-NEXT:    movdqa 272(%rdi), %xmm3
8659; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
8660; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8661; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
8662; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8663; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
8664; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8665; SSE-NEXT:    movdqa 672(%rdi), %xmm11
8666; SSE-NEXT:    movdqa 688(%rdi), %xmm0
8667; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8668; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
8669; SSE-NEXT:    movdqa %xmm11, %xmm7
8670; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
8671; SSE-NEXT:    movdqa 752(%rdi), %xmm14
8672; SSE-NEXT:    movdqa 720(%rdi), %xmm1
8673; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
8674; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8675; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
8676; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8677; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8678; SSE-NEXT:    movdqa 1120(%rdi), %xmm9
8679; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8680; SSE-NEXT:    movdqa 1136(%rdi), %xmm0
8681; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8682; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
8683; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
8684; SSE-NEXT:    movdqa 1200(%rdi), %xmm5
8685; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8686; SSE-NEXT:    movdqa 1168(%rdi), %xmm6
8687; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
8688; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8689; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
8690; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1]
8691; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8692; SSE-NEXT:    movdqa 1568(%rdi), %xmm9
8693; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8694; SSE-NEXT:    movdqa 1584(%rdi), %xmm0
8695; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8696; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
8697; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
8698; SSE-NEXT:    movdqa 1648(%rdi), %xmm5
8699; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8700; SSE-NEXT:    movdqa 1616(%rdi), %xmm0
8701; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8702; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8703; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
8704; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1]
8705; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8706; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8707; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2]
8708; SSE-NEXT:    punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
8709; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8710; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8711; SSE-NEXT:    movdqa 144(%rdi), %xmm0
8712; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8713; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
8714; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1]
8715; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8716; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8717; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2]
8718; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8719; SSE-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
8720; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8721; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
8722; SSE-NEXT:    movdqa 32(%rdi), %xmm5
8723; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8724; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
8725; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1]
8726; SSE-NEXT:    movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8727; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2]
8728; SSE-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
8729; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1]
8730; SSE-NEXT:    movdqa 368(%rdi), %xmm4
8731; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8732; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
8733; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1]
8734; SSE-NEXT:    movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8735; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
8736; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
8737; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1]
8738; SSE-NEXT:    movdqa 256(%rdi), %xmm2
8739; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8740; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
8741; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1]
8742; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8743; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2]
8744; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8745; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3]
8746; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8747; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8748; SSE-NEXT:    movdqa 592(%rdi), %xmm2
8749; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8750; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
8751; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1]
8752; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8753; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8754; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2]
8755; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8756; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8757; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8758; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1]
8759; SSE-NEXT:    movdqa 480(%rdi), %xmm5
8760; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8761; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
8762; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8763; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8764; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8765; SSE-NEXT:    # xmm7 = mem[2,2,2,2]
8766; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8767; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8768; SSE-NEXT:    movdqa (%rsp), %xmm13 # 16-byte Reload
8769; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1]
8770; SSE-NEXT:    movdqa 816(%rdi), %xmm5
8771; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8772; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
8773; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8774; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8775; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8776; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2]
8777; SSE-NEXT:    movdqa %xmm1, %xmm2
8778; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8779; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1]
8780; SSE-NEXT:    movdqa %xmm11, %xmm12
8781; SSE-NEXT:    movdqa 704(%rdi), %xmm1
8782; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8783; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
8784; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8785; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8786; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8787; SSE-NEXT:    # xmm7 = mem[2,2,2,2]
8788; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8789; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
8790; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8791; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8792; SSE-NEXT:    movdqa 1040(%rdi), %xmm2
8793; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8794; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
8795; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
8796; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8797; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8798; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2]
8799; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8800; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
8801; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8802; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8803; SSE-NEXT:    movdqa 928(%rdi), %xmm2
8804; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8805; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
8806; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
8807; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8808; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8809; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2]
8810; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8811; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8812; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8813; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8814; SSE-NEXT:    movdqa 1264(%rdi), %xmm1
8815; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8816; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
8817; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8818; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8819; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8820; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2]
8821; SSE-NEXT:    movdqa %xmm6, %xmm2
8822; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8823; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8824; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
8825; SSE-NEXT:    movdqa 1152(%rdi), %xmm6
8826; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8827; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
8828; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8829; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8830; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8831; SSE-NEXT:    # xmm7 = mem[2,2,2,2]
8832; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8833; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8834; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8835; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8836; SSE-NEXT:    movdqa 1488(%rdi), %xmm6
8837; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8838; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
8839; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8840; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8841; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8842; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2]
8843; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8844; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8845; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8846; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8847; SSE-NEXT:    movdqa 1376(%rdi), %xmm6
8848; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8849; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
8850; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8851; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8852; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8853; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2]
8854; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8855; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8856; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8857; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8858; SSE-NEXT:    movdqa 1712(%rdi), %xmm6
8859; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8860; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
8861; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8862; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8863; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8864; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,2,2,2]
8865; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8866; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
8867; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8868; SSE-NEXT:    # xmm7 = mem[1,1,1,1]
8869; SSE-NEXT:    movdqa 1600(%rdi), %xmm9
8870; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8871; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8872; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8873; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8874; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
8875; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8876; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8877; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8878; SSE-NEXT:    movdqa 64(%rdi), %xmm0
8879; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8880; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3]
8881; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
8882; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
8883; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
8884; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8885; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8886; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
8887; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8888; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8889; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8890; SSE-NEXT:    movdqa 176(%rdi), %xmm0
8891; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8892; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8893; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
8894; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8895; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8896; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8897; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8898; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8899; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
8900; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8901; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8902; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8903; SSE-NEXT:    movdqa 288(%rdi), %xmm0
8904; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8905; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8906; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
8907; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8908; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8909; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8910; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8911; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8912; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
8913; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8914; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8915; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8916; SSE-NEXT:    movdqa 400(%rdi), %xmm0
8917; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8918; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8919; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
8920; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8921; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8922; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8923; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8924; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
8925; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8926; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8927; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8928; SSE-NEXT:    movdqa 512(%rdi), %xmm0
8929; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8930; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3]
8931; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8932; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8933; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8934; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8935; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8936; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
8937; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8938; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8939; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8940; SSE-NEXT:    movdqa 624(%rdi), %xmm0
8941; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8942; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8943; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
8944; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8945; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8946; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8947; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8948; SSE-NEXT:    movdqa %xmm12, %xmm15
8949; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
8950; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8951; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8952; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8953; SSE-NEXT:    movdqa 736(%rdi), %xmm0
8954; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8955; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3]
8956; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8957; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8958; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8959; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8960; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3]
8961; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8962; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1]
8963; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8964; SSE-NEXT:    movdqa 848(%rdi), %xmm0
8965; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8966; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8967; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
8968; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8969; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8970; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8971; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8972; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8973; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
8974; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8975; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
8976; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8977; SSE-NEXT:    movdqa 960(%rdi), %xmm0
8978; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8979; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
8980; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8981; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8982; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8983; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8984; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8985; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
8986; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8987; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1]
8988; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
8989; SSE-NEXT:    movdqa 1072(%rdi), %xmm0
8990; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8991; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8992; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
8993; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
8994; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8995; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
8996; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8997; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
8998; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8999; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1]
9000; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9001; SSE-NEXT:    movdqa 1184(%rdi), %xmm0
9002; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9003; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
9004; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9005; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9006; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
9007; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9008; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9009; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
9010; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9011; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
9012; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9013; SSE-NEXT:    movdqa 1296(%rdi), %xmm0
9014; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9015; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9016; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
9017; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9018; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9019; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
9020; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9021; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9022; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
9023; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9024; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
9025; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9026; SSE-NEXT:    movdqa 1408(%rdi), %xmm0
9027; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9028; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3]
9029; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9030; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9031; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
9032; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9033; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9034; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
9035; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9036; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
9037; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9038; SSE-NEXT:    movdqa 1520(%rdi), %xmm0
9039; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9040; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9041; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
9042; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9043; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9044; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
9045; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9046; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9047; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
9048; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9049; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
9050; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9051; SSE-NEXT:    movdqa 1632(%rdi), %xmm0
9052; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9053; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3]
9054; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9055; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9056; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
9057; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9058; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9059; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
9060; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9061; SSE-NEXT:    # xmm9 = mem[1,1,1,1]
9062; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9063; SSE-NEXT:    movdqa 1744(%rdi), %xmm0
9064; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9065; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3]
9066; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9067; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9068; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
9069; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9070; SSE-NEXT:    movdqa 96(%rdi), %xmm0
9071; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9072; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1]
9073; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9074; SSE-NEXT:    movdqa %xmm5, %xmm0
9075; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
9076; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9077; SSE-NEXT:    # xmm4 = mem[2,2,3,3]
9078; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9079; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3]
9080; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
9081; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9082; SSE-NEXT:    movdqa 208(%rdi), %xmm0
9083; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9084; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1]
9085; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9086; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
9087; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
9088; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9089; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9090; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
9091; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9092; SSE-NEXT:    movdqa 320(%rdi), %xmm0
9093; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9094; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
9095; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9096; SSE-NEXT:    movdqa %xmm13, %xmm0
9097; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9098; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9099; SSE-NEXT:    # xmm2 = mem[2,2,3,3]
9100; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9101; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9102; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
9103; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9104; SSE-NEXT:    movdqa 432(%rdi), %xmm0
9105; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9106; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
9107; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9108; SSE-NEXT:    movdqa %xmm11, %xmm0
9109; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9110; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9111; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
9112; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9113; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
9114; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9115; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9116; SSE-NEXT:    movdqa 544(%rdi), %xmm0
9117; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9118; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
9119; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9120; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9121; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9122; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
9123; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9124; SSE-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9125; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9126; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9127; SSE-NEXT:    movdqa 656(%rdi), %xmm0
9128; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9129; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
9130; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9131; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9132; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9133; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
9134; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9135; SSE-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9136; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9137; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9138; SSE-NEXT:    movdqa 768(%rdi), %xmm0
9139; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9140; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
9141; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9142; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9143; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3]
9144; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9145; SSE-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9146; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9147; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9148; SSE-NEXT:    movdqa 880(%rdi), %xmm0
9149; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9150; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
9151; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9152; SSE-NEXT:    movdqa %xmm15, %xmm2
9153; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9154; SSE-NEXT:    pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload
9155; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9156; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
9157; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9158; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9159; SSE-NEXT:    movdqa 992(%rdi), %xmm0
9160; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9161; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9162; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9163; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9164; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9165; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9166; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9167; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9168; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9169; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9170; SSE-NEXT:    movdqa 1104(%rdi), %xmm0
9171; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
9172; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9173; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9174; SSE-NEXT:    movdqa %xmm12, %xmm1
9175; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9176; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9177; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9178; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
9179; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9180; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9181; SSE-NEXT:    movdqa 1216(%rdi), %xmm0
9182; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9183; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9184; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9185; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9186; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9187; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9188; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
9189; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9190; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9191; SSE-NEXT:    movdqa 1328(%rdi), %xmm0
9192; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9193; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9194; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9195; SSE-NEXT:    movdqa %xmm14, %xmm1
9196; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9197; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
9198; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9199; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9200; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9201; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9202; SSE-NEXT:    movdqa 1440(%rdi), %xmm0
9203; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9204; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9205; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9206; SSE-NEXT:    movdqa %xmm9, %xmm1
9207; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9208; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9209; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9210; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9211; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9212; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9213; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9214; SSE-NEXT:    movdqa 1552(%rdi), %xmm0
9215; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9216; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9217; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9218; SSE-NEXT:    movdqa %xmm6, %xmm1
9219; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9220; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9221; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9222; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9223; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9224; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9225; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9226; SSE-NEXT:    movdqa 1664(%rdi), %xmm0
9227; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9228; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9229; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9230; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9231; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9232; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9233; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9234; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
9235; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9236; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9237; SSE-NEXT:    movdqa 1776(%rdi), %xmm0
9238; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9239; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
9240; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9241; SSE-NEXT:    movdqa %xmm4, %xmm1
9242; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9243; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9244; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9245; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9246; SSE-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9247; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9248; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9249; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
9250; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9251; SSE-NEXT:    movdqa %xmm7, %xmm1
9252; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9253; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
9254; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9255; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9256; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9257; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9258; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9259; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9260; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9261; SSE-NEXT:    movdqa %xmm5, %xmm1
9262; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9263; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9264; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9265; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9266; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9267; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9268; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9269; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9270; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9271; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9272; SSE-NEXT:    movdqa %xmm8, %xmm1
9273; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9274; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
9275; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9276; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9277; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9278; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9279; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
9280; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9281; SSE-NEXT:    movdqa %xmm10, %xmm1
9282; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9283; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
9284; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9285; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
9286; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9287; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9288; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9289; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9290; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9291; SSE-NEXT:    movdqa %xmm11, %xmm1
9292; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9293; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9294; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9295; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9296; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9297; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9298; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9299; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9300; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9301; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9302; SSE-NEXT:    movdqa %xmm13, %xmm1
9303; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9304; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9305; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9306; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9307; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9308; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9309; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9310; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9311; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9312; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9313; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9314; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9315; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9316; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9317; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9318; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9319; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9320; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9321; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9322; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9323; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9324; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
9325; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9326; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9327; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9328; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9329; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9330; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9331; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9332; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9333; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9334; SSE-NEXT:    # xmm15 = mem[2,2,3,3]
9335; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9336; SSE-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
9337; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
9338; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9339; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9340; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9341; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9342; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9343; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
9344; SSE-NEXT:    movdqa (%rsp), %xmm15 # 16-byte Reload
9345; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
9346; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9347; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9348; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9349; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9350; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9351; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9352; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9353; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9354; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9355; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
9356; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9357; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9358; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9359; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9360; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9361; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9362; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
9363; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9364; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9365; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9366; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9367; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9368; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9369; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9370; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9371; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
9372; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9373; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9374; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9375; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9376; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9377; SSE-NEXT:    # xmm0 = mem[3,3,3,3]
9378; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9379; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9380; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
9381; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9382; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
9383; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9384; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9385; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
9386; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9387; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9388; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9389; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
9390; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9391; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9392; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9393; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9394; SSE-NEXT:    pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9395; SSE-NEXT:    # xmm2 = mem[3,3,3,3]
9396; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9397; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9398; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
9399; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9400; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9401; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9402; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9403; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9404; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9405; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9406; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9407; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
9408; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9409; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9410; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9411; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9412; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9413; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9414; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9415; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9416; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
9417; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9418; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9419; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9420; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9421; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9422; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9423; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9424; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9425; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
9426; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9427; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9428; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9429; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9430; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
9431; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9432; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9433; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
9434; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9435; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9436; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9437; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9438; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9439; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
9440; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9441; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9442; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
9443; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9444; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
9445; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9446; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9447; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9448; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
9449; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9450; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9451; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
9452; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9453; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
9454; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9455; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9456; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9457; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9458; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9459; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9460; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9461; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9462; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9463; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9464; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9465; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9466; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9467; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9468; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9469; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9470; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9471; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9472; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9473; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
9474; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9475; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9476; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9477; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9478; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9479; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9480; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9481; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9482; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9483; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9484; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9485; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9486; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
9487; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9488; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9489; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9490; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9491; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9492; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
9493; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9494; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9495; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
9496; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9497; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9498; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9499; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9500; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9501; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9502; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9503; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9504; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9505; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9506; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9507; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9508; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9509; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9510; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9511; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9512; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9513; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9514; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
9515; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9516; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9517; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9518; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9519; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9520; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9521; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9522; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9523; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
9524; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9525; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9526; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9527; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9528; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9529; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
9530; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9531; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9532; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9533; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9534; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9535; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
9536; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9537; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9538; SSE-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9539; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
9540; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9541; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9542; SSE-NEXT:    pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9543; SSE-NEXT:    # xmm0 = mem[2,2,2,2]
9544; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
9545; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9546; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9547; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9548; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9549; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
9550; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9551; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9552; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9553; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9554; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9555; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9556; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9557; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9558; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9559; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
9560; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9561; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
9562; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9563; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9564; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9565; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9566; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9567; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9568; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9569; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9570; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9571; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
9572; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9573; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
9574; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9575; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9576; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9577; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9578; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9579; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9580; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9581; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9582; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9583; SSE-NEXT:    # xmm15 = mem[0,0,1,1]
9584; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
9585; SSE-NEXT:    movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
9586; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9587; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9588; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9589; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9590; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9591; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9592; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9593; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9594; SSE-NEXT:    # xmm14 = mem[0,0,1,1]
9595; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
9596; SSE-NEXT:    movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1]
9597; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
9598; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9599; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9600; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9601; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
9602; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9603; SSE-NEXT:    # xmm13 = mem[0,0,1,1]
9604; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
9605; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
9606; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
9607; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9608; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9609; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9610; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
9611; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9612; SSE-NEXT:    # xmm12 = mem[0,0,1,1]
9613; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
9614; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1]
9615; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9616; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9617; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9618; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9619; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9620; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9621; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9622; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9623; SSE-NEXT:    # xmm11 = mem[0,0,1,1]
9624; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
9625; SSE-NEXT:    movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
9626; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
9627; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9628; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9629; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9630; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9631; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9632; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9633; SSE-NEXT:    # xmm10 = mem[0,0,1,1]
9634; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
9635; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
9636; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9637; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9638; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9639; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9640; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9641; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9642; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9643; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9644; SSE-NEXT:    # xmm9 = mem[0,0,1,1]
9645; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
9646; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
9647; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
9648; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9649; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9650; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9651; SSE-NEXT:    pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
9652; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9653; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9654; SSE-NEXT:    # xmm8 = mem[0,0,1,1]
9655; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
9656; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
9657; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9658; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9659; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9660; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9661; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9662; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9663; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9664; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9665; SSE-NEXT:    # xmm7 = mem[0,0,1,1]
9666; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
9667; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
9668; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9669; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9670; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9671; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9672; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9673; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9674; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9675; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9676; SSE-NEXT:    # xmm6 = mem[0,0,1,1]
9677; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
9678; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
9679; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9680; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
9681; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9682; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9683; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9684; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9685; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9686; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9687; SSE-NEXT:    # xmm5 = mem[0,0,1,1]
9688; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
9689; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
9690; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
9691; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9692; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9693; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9694; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9695; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9696; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9697; SSE-NEXT:    # xmm4 = mem[0,0,1,1]
9698; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9699; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
9700; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9701; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
9702; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9703; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9704; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9705; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9706; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9707; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9708; SSE-NEXT:    # xmm3 = mem[0,0,1,1]
9709; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
9710; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
9711; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
9712; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9713; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
9714; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9715; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9716; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
9717; SSE-NEXT:    pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9718; SSE-NEXT:    # xmm2 = mem[0,0,1,1]
9719; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
9720; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
9721; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9722; SSE-NEXT:    movaps %xmm0, 224(%rsi)
9723; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9724; SSE-NEXT:    movaps %xmm0, 160(%rsi)
9725; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9726; SSE-NEXT:    movaps %xmm0, 96(%rsi)
9727; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9728; SSE-NEXT:    movaps %xmm0, 32(%rsi)
9729; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9730; SSE-NEXT:    movaps %xmm0, 240(%rsi)
9731; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9732; SSE-NEXT:    movaps %xmm0, 176(%rsi)
9733; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9734; SSE-NEXT:    movaps %xmm0, 112(%rsi)
9735; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9736; SSE-NEXT:    movaps %xmm0, 48(%rsi)
9737; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9738; SSE-NEXT:    movaps %xmm0, 192(%rsi)
9739; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9740; SSE-NEXT:    movaps %xmm0, 128(%rsi)
9741; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9742; SSE-NEXT:    movaps %xmm0, 64(%rsi)
9743; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9744; SSE-NEXT:    movaps %xmm0, (%rsi)
9745; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9746; SSE-NEXT:    movaps %xmm0, 208(%rsi)
9747; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9748; SSE-NEXT:    movaps %xmm0, 144(%rsi)
9749; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9750; SSE-NEXT:    movaps %xmm0, 80(%rsi)
9751; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9752; SSE-NEXT:    movaps %xmm0, 16(%rsi)
9753; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9754; SSE-NEXT:    movaps %xmm0, 224(%rdx)
9755; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9756; SSE-NEXT:    movaps %xmm0, 240(%rdx)
9757; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9758; SSE-NEXT:    movaps %xmm0, 192(%rdx)
9759; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9760; SSE-NEXT:    movaps %xmm0, 208(%rdx)
9761; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9762; SSE-NEXT:    movaps %xmm0, 160(%rdx)
9763; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9764; SSE-NEXT:    movaps %xmm0, 176(%rdx)
9765; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9766; SSE-NEXT:    movaps %xmm0, 128(%rdx)
9767; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9768; SSE-NEXT:    movaps %xmm0, 144(%rdx)
9769; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9770; SSE-NEXT:    movaps %xmm0, 96(%rdx)
9771; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9772; SSE-NEXT:    movaps %xmm0, 112(%rdx)
9773; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9774; SSE-NEXT:    movaps %xmm0, 64(%rdx)
9775; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9776; SSE-NEXT:    movaps %xmm0, 80(%rdx)
9777; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9778; SSE-NEXT:    movaps %xmm0, 32(%rdx)
9779; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9780; SSE-NEXT:    movaps %xmm0, 48(%rdx)
9781; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9782; SSE-NEXT:    movaps %xmm0, (%rdx)
9783; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9784; SSE-NEXT:    movaps %xmm0, 16(%rdx)
9785; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9786; SSE-NEXT:    movaps %xmm0, 240(%rcx)
9787; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9788; SSE-NEXT:    movaps %xmm0, 224(%rcx)
9789; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9790; SSE-NEXT:    movaps %xmm0, 208(%rcx)
9791; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9792; SSE-NEXT:    movaps %xmm0, 192(%rcx)
9793; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9794; SSE-NEXT:    movaps %xmm0, 176(%rcx)
9795; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9796; SSE-NEXT:    movaps %xmm0, 160(%rcx)
9797; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9798; SSE-NEXT:    movaps %xmm0, 144(%rcx)
9799; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9800; SSE-NEXT:    movaps %xmm0, 128(%rcx)
9801; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9802; SSE-NEXT:    movaps %xmm0, 112(%rcx)
9803; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9804; SSE-NEXT:    movaps %xmm0, 96(%rcx)
9805; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9806; SSE-NEXT:    movaps %xmm0, 80(%rcx)
9807; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9808; SSE-NEXT:    movaps %xmm0, 64(%rcx)
9809; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9810; SSE-NEXT:    movaps %xmm0, 48(%rcx)
9811; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9812; SSE-NEXT:    movaps %xmm0, 32(%rcx)
9813; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9814; SSE-NEXT:    movaps %xmm0, 16(%rcx)
9815; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9816; SSE-NEXT:    movaps %xmm0, (%rcx)
9817; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9818; SSE-NEXT:    movaps %xmm0, 240(%r8)
9819; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9820; SSE-NEXT:    movaps %xmm0, 224(%r8)
9821; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9822; SSE-NEXT:    movaps %xmm0, 208(%r8)
9823; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9824; SSE-NEXT:    movaps %xmm0, 192(%r8)
9825; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9826; SSE-NEXT:    movaps %xmm0, 176(%r8)
9827; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9828; SSE-NEXT:    movaps %xmm0, 160(%r8)
9829; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9830; SSE-NEXT:    movaps %xmm0, 144(%r8)
9831; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9832; SSE-NEXT:    movaps %xmm0, 128(%r8)
9833; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9834; SSE-NEXT:    movaps %xmm0, 112(%r8)
9835; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9836; SSE-NEXT:    movaps %xmm0, 96(%r8)
9837; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9838; SSE-NEXT:    movaps %xmm0, 80(%r8)
9839; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9840; SSE-NEXT:    movaps %xmm0, 64(%r8)
9841; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9842; SSE-NEXT:    movaps %xmm0, 48(%r8)
9843; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9844; SSE-NEXT:    movaps %xmm0, 32(%r8)
9845; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9846; SSE-NEXT:    movaps %xmm0, 16(%r8)
9847; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9848; SSE-NEXT:    movaps %xmm0, (%r8)
9849; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9850; SSE-NEXT:    movaps %xmm0, 240(%r9)
9851; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9852; SSE-NEXT:    movaps %xmm0, 224(%r9)
9853; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9854; SSE-NEXT:    movaps %xmm0, 208(%r9)
9855; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9856; SSE-NEXT:    movaps %xmm0, 192(%r9)
9857; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9858; SSE-NEXT:    movaps %xmm0, 176(%r9)
9859; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9860; SSE-NEXT:    movaps %xmm0, 160(%r9)
9861; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9862; SSE-NEXT:    movaps %xmm0, 144(%r9)
9863; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9864; SSE-NEXT:    movaps %xmm0, 128(%r9)
9865; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9866; SSE-NEXT:    movaps %xmm0, 112(%r9)
9867; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9868; SSE-NEXT:    movaps %xmm0, 96(%r9)
9869; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9870; SSE-NEXT:    movaps %xmm0, 80(%r9)
9871; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9872; SSE-NEXT:    movaps %xmm0, 64(%r9)
9873; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9874; SSE-NEXT:    movaps %xmm0, 48(%r9)
9875; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9876; SSE-NEXT:    movaps %xmm0, 32(%r9)
9877; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9878; SSE-NEXT:    movaps %xmm0, 16(%r9)
9879; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9880; SSE-NEXT:    movaps %xmm0, (%r9)
9881; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9882; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9883; SSE-NEXT:    movaps %xmm0, 240(%rax)
9884; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9885; SSE-NEXT:    movaps %xmm0, 224(%rax)
9886; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9887; SSE-NEXT:    movaps %xmm0, 208(%rax)
9888; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9889; SSE-NEXT:    movaps %xmm0, 192(%rax)
9890; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9891; SSE-NEXT:    movaps %xmm0, 176(%rax)
9892; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9893; SSE-NEXT:    movaps %xmm0, 160(%rax)
9894; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9895; SSE-NEXT:    movaps %xmm0, 144(%rax)
9896; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9897; SSE-NEXT:    movaps %xmm0, 128(%rax)
9898; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9899; SSE-NEXT:    movaps %xmm0, 112(%rax)
9900; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9901; SSE-NEXT:    movaps %xmm0, 96(%rax)
9902; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9903; SSE-NEXT:    movaps %xmm0, 80(%rax)
9904; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9905; SSE-NEXT:    movaps %xmm0, 64(%rax)
9906; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9907; SSE-NEXT:    movaps %xmm0, 48(%rax)
9908; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9909; SSE-NEXT:    movaps %xmm0, 32(%rax)
9910; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9911; SSE-NEXT:    movaps %xmm0, 16(%rax)
9912; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9913; SSE-NEXT:    movaps %xmm0, (%rax)
9914; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9915; SSE-NEXT:    movapd %xmm2, 240(%rax)
9916; SSE-NEXT:    movapd %xmm3, 224(%rax)
9917; SSE-NEXT:    movapd %xmm4, 208(%rax)
9918; SSE-NEXT:    movapd %xmm5, 192(%rax)
9919; SSE-NEXT:    movapd %xmm6, 176(%rax)
9920; SSE-NEXT:    movapd %xmm7, 160(%rax)
9921; SSE-NEXT:    movapd %xmm8, 144(%rax)
9922; SSE-NEXT:    movapd %xmm9, 128(%rax)
9923; SSE-NEXT:    movapd %xmm10, 112(%rax)
9924; SSE-NEXT:    movapd %xmm11, 96(%rax)
9925; SSE-NEXT:    movapd %xmm12, 80(%rax)
9926; SSE-NEXT:    movapd %xmm13, 64(%rax)
9927; SSE-NEXT:    movapd %xmm14, 48(%rax)
9928; SSE-NEXT:    movapd %xmm15, 32(%rax)
9929; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9930; SSE-NEXT:    movaps %xmm0, 16(%rax)
9931; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9932; SSE-NEXT:    movaps %xmm0, (%rax)
9933; SSE-NEXT:    addq $2456, %rsp # imm = 0x998
9934; SSE-NEXT:    retq
9935;
9936; AVX-LABEL: load_i32_stride7_vf64:
9937; AVX:       # %bb.0:
9938; AVX-NEXT:    subq $3176, %rsp # imm = 0xC68
9939; AVX-NEXT:    vmovaps 704(%rdi), %ymm2
9940; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9941; AVX-NEXT:    vmovaps 672(%rdi), %ymm3
9942; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9943; AVX-NEXT:    vmovaps 768(%rdi), %ymm11
9944; AVX-NEXT:    vmovaps 256(%rdi), %ymm4
9945; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9946; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
9947; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9948; AVX-NEXT:    vmovaps 320(%rdi), %ymm6
9949; AVX-NEXT:    vmovaps 304(%rdi), %xmm0
9950; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9951; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
9952; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9953; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
9954; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
9955; AVX-NEXT:    vmovaps 224(%rdi), %xmm10
9956; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
9957; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
9958; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
9959; AVX-NEXT:    vmovaps 384(%rdi), %xmm1
9960; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9961; AVX-NEXT:    vmovaps 352(%rdi), %xmm4
9962; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9963; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1]
9964; AVX-NEXT:    vmovaps 416(%rdi), %xmm4
9965; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9966; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1]
9967; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
9968; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
9969; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9970; AVX-NEXT:    vmovaps 752(%rdi), %xmm0
9971; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9972; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
9973; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9974; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
9975; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
9976; AVX-NEXT:    vmovaps 672(%rdi), %xmm15
9977; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
9978; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
9979; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
9980; AVX-NEXT:    vmovaps 832(%rdi), %xmm1
9981; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9982; AVX-NEXT:    vmovaps 800(%rdi), %xmm2
9983; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9984; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
9985; AVX-NEXT:    vmovaps 864(%rdi), %xmm13
9986; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1]
9987; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
9988; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
9989; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9990; AVX-NEXT:    vmovaps 1152(%rdi), %ymm1
9991; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9992; AVX-NEXT:    vmovaps 1120(%rdi), %ymm0
9993; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9994; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
9995; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
9996; AVX-NEXT:    vmovaps 1120(%rdi), %xmm1
9997; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9998; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9999; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
10000; AVX-NEXT:    vmovaps 1216(%rdi), %ymm9
10001; AVX-NEXT:    vmovaps 1200(%rdi), %xmm1
10002; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10003; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
10004; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10005; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10006; AVX-NEXT:    vmovaps 1280(%rdi), %xmm1
10007; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10008; AVX-NEXT:    vmovaps 1248(%rdi), %xmm2
10009; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10010; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
10011; AVX-NEXT:    vmovaps 1312(%rdi), %xmm2
10012; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10013; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
10014; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10015; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10016; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10017; AVX-NEXT:    vmovaps 1600(%rdi), %ymm1
10018; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10019; AVX-NEXT:    vmovaps 1568(%rdi), %ymm0
10020; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10021; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
10022; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
10023; AVX-NEXT:    vmovaps 1568(%rdi), %xmm1
10024; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10025; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10026; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
10027; AVX-NEXT:    vmovaps 1664(%rdi), %ymm2
10028; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10029; AVX-NEXT:    vmovaps 1648(%rdi), %xmm1
10030; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10031; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
10032; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10033; AVX-NEXT:    vmovaps 1728(%rdi), %xmm1
10034; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10035; AVX-NEXT:    vmovaps 1696(%rdi), %xmm2
10036; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10037; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
10038; AVX-NEXT:    vmovaps 1760(%rdi), %xmm2
10039; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10040; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
10041; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10042; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10043; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10044; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
10045; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10046; AVX-NEXT:    vmovaps (%rdi), %ymm0
10047; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10048; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
10049; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
10050; AVX-NEXT:    vmovaps (%rdi), %xmm1
10051; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10052; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10053; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
10054; AVX-NEXT:    vmovaps 96(%rdi), %ymm14
10055; AVX-NEXT:    vmovaps 80(%rdi), %xmm1
10056; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10057; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2]
10058; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10059; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10060; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
10061; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10062; AVX-NEXT:    vmovaps 128(%rdi), %xmm2
10063; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10064; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
10065; AVX-NEXT:    vmovaps 192(%rdi), %xmm2
10066; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10067; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
10068; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10069; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10070; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10071; AVX-NEXT:    vmovaps 480(%rdi), %ymm1
10072; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10073; AVX-NEXT:    vmovaps 448(%rdi), %ymm0
10074; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10075; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
10076; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
10077; AVX-NEXT:    vmovaps 448(%rdi), %xmm1
10078; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10079; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10080; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
10081; AVX-NEXT:    vmovaps 544(%rdi), %ymm2
10082; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10083; AVX-NEXT:    vmovaps 528(%rdi), %xmm1
10084; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10085; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
10086; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10087; AVX-NEXT:    vmovaps 608(%rdi), %xmm1
10088; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10089; AVX-NEXT:    vmovaps 576(%rdi), %xmm2
10090; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10091; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
10092; AVX-NEXT:    vmovaps 640(%rdi), %xmm2
10093; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10094; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
10095; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10096; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10097; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10098; AVX-NEXT:    vmovaps 928(%rdi), %ymm1
10099; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10100; AVX-NEXT:    vmovaps 896(%rdi), %ymm0
10101; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10102; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
10103; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
10104; AVX-NEXT:    vmovaps 896(%rdi), %xmm12
10105; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3]
10106; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
10107; AVX-NEXT:    vmovaps 992(%rdi), %ymm5
10108; AVX-NEXT:    vmovaps 976(%rdi), %xmm1
10109; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10110; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2]
10111; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10112; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10113; AVX-NEXT:    vmovaps 1056(%rdi), %xmm1
10114; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10115; AVX-NEXT:    vmovaps 1024(%rdi), %xmm2
10116; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10117; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
10118; AVX-NEXT:    vmovaps 1088(%rdi), %xmm8
10119; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1]
10120; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10121; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10122; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10123; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10124; AVX-NEXT:    vmovaps 1376(%rdi), %ymm1
10125; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10126; AVX-NEXT:    vmovaps 1344(%rdi), %ymm0
10127; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10128; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
10129; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
10130; AVX-NEXT:    vmovaps 1344(%rdi), %xmm1
10131; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10132; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10133; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
10134; AVX-NEXT:    vmovaps 1440(%rdi), %ymm4
10135; AVX-NEXT:    vmovaps 1424(%rdi), %xmm1
10136; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10137; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
10138; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10139; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10140; AVX-NEXT:    vmovaps 1504(%rdi), %xmm1
10141; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10142; AVX-NEXT:    vmovaps 1472(%rdi), %xmm2
10143; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10144; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
10145; AVX-NEXT:    vmovaps 1536(%rdi), %xmm2
10146; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10147; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
10148; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10149; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10150; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10151; AVX-NEXT:    vmovaps 288(%rdi), %ymm0
10152; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10153; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6]
10154; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
10155; AVX-NEXT:    vmovaps 256(%rdi), %xmm1
10156; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10157; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3]
10158; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10159; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
10160; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10161; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
10162; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10163; AVX-NEXT:    vmovaps 352(%rdi), %ymm2
10164; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10165; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
10166; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
10167; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10168; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10169; AVX-NEXT:    # xmm1 = zero,xmm1[1,2],mem[0]
10170; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10171; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10172; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10173; AVX-NEXT:    vmovaps 736(%rdi), %ymm0
10174; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10175; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6]
10176; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
10177; AVX-NEXT:    vmovaps 704(%rdi), %xmm1
10178; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10179; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3]
10180; AVX-NEXT:    vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10181; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
10182; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10183; AVX-NEXT:    vmovaps 832(%rdi), %ymm1
10184; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10185; AVX-NEXT:    vmovaps 800(%rdi), %ymm2
10186; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10187; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
10188; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
10189; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10190; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[2]
10191; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10192; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10193; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10194; AVX-NEXT:    vmovaps 1184(%rdi), %ymm0
10195; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10196; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6]
10197; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
10198; AVX-NEXT:    vmovaps 1152(%rdi), %xmm1
10199; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10200; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10201; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3]
10202; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
10203; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10204; AVX-NEXT:    vmovaps 1280(%rdi), %ymm1
10205; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10206; AVX-NEXT:    vmovaps 1248(%rdi), %ymm2
10207; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10208; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
10209; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
10210; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10211; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10212; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[2]
10213; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10214; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10215; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10216; AVX-NEXT:    vmovaps 1632(%rdi), %ymm0
10217; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10218; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10219; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6]
10220; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
10221; AVX-NEXT:    vmovaps 1600(%rdi), %xmm1
10222; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10223; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10224; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
10225; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
10226; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10227; AVX-NEXT:    vmovaps 1728(%rdi), %ymm1
10228; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10229; AVX-NEXT:    vmovaps 1696(%rdi), %ymm2
10230; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10231; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
10232; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
10233; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
10234; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10235; AVX-NEXT:    # xmm1 = zero,xmm1[1,2],mem[0]
10236; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
10237; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10238; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10239; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
10240; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10241; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6]
10242; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
10243; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
10244; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10245; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10246; AVX-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
10247; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
10248; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10249; AVX-NEXT:    vmovaps 160(%rdi), %ymm2
10250; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10251; AVX-NEXT:    vmovaps 128(%rdi), %ymm0
10252; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10253; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1]
10254; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7]
10255; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
10256; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10257; AVX-NEXT:    # xmm2 = zero,xmm2[1,2],mem[0]
10258; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
10259; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
10260; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10261; AVX-NEXT:    vmovaps 512(%rdi), %ymm0
10262; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10263; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10264; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6]
10265; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
10266; AVX-NEXT:    vmovaps 480(%rdi), %xmm0
10267; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10268; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10269; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0],xmm14[1],xmm0[2,3]
10270; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3]
10271; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
10272; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
10273; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10274; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
10275; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10276; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1]
10277; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7]
10278; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
10279; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10280; AVX-NEXT:    # xmm3 = zero,xmm3[1,2],mem[0]
10281; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
10282; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
10283; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10284; AVX-NEXT:    vmovaps 960(%rdi), %ymm0
10285; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10286; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6]
10287; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
10288; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
10289; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10290; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3]
10291; AVX-NEXT:    vmovaps %xmm12, %xmm6
10292; AVX-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10293; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
10294; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
10295; AVX-NEXT:    vmovaps 1056(%rdi), %ymm0
10296; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10297; AVX-NEXT:    vmovaps 1024(%rdi), %ymm1
10298; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10299; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1]
10300; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7]
10301; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
10302; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm8[2]
10303; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
10304; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
10305; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10306; AVX-NEXT:    vmovaps 1408(%rdi), %ymm0
10307; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10308; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6]
10309; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
10310; AVX-NEXT:    vmovaps 1376(%rdi), %xmm4
10311; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10312; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm4[0],xmm3[1],xmm4[2,3]
10313; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3]
10314; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm5[3,4,5,6,7]
10315; AVX-NEXT:    vmovaps 1504(%rdi), %ymm0
10316; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10317; AVX-NEXT:    vmovaps 1472(%rdi), %ymm1
10318; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10319; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[0,1]
10320; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,0],ymm8[3,3],ymm1[4,4],ymm8[7,7]
10321; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
10322; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10323; AVX-NEXT:    # xmm8 = zero,xmm8[1,2],mem[0]
10324; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10325; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7]
10326; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10327; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,3,2,3]
10328; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10329; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10330; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10331; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10332; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10333; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10334; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
10335; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10336; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10337; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10338; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
10339; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10340; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10341; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
10342; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10343; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10344; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10345; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3]
10346; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10347; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10348; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10349; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10350; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10351; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10352; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
10353; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10354; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10355; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10356; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
10357; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10358; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3]
10359; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10360; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10361; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10362; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3]
10363; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10364; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10365; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10366; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10367; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10368; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10369; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
10370; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10371; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10372; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10373; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10374; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10375; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm9[3]
10376; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10377; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10378; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10379; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm7[2,3,2,3]
10380; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10381; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
10382; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10383; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10384; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10385; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10386; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
10387; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10388; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10389; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10390; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm11[0],ymm15[2],ymm11[2]
10391; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10392; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10393; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
10394; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10395; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10396; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10397; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10398; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
10399; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10400; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10401; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10402; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10403; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10404; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10405; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
10406; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10407; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10408; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
10409; AVX-NEXT:    # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
10410; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10411; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10412; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
10413; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10414; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10415; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10416; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm14[2,3,2,3]
10417; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10418; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10419; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10420; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10421; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10422; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10423; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
10424; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10425; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10426; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
10427; AVX-NEXT:    # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
10428; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10429; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10430; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
10431; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10432; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10433; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10434; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,3,2,3]
10435; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10436; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10437; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10438; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
10439; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
10440; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10441; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm14[2,1],ymm12[2,0],ymm14[6,5],ymm12[6,4]
10442; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10443; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10444; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
10445; AVX-NEXT:    # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
10446; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10447; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10448; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
10449; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10450; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10451; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10452; AVX-NEXT:    vmovaps %xmm3, %xmm9
10453; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,3,2,3]
10454; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3]
10455; AVX-NEXT:    vmovaps %xmm4, %xmm6
10456; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10457; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10458; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10459; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm5[0,3],ymm12[7,5],ymm5[4,7]
10460; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10461; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm4[2,1],ymm12[2,0],ymm4[6,5],ymm12[6,4]
10462; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
10463; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10464; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10465; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
10466; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
10467; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10468; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
10469; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
10470; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10471; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10472; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10473; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10474; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4]
10475; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6]
10476; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10477; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
10478; AVX-NEXT:    # xmm12 = mem[0,1,2],xmm0[3]
10479; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
10480; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
10481; AVX-NEXT:    vmovaps 416(%rdi), %ymm0
10482; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10483; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm10[1,3],ymm0[4,5],ymm10[5,7]
10484; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10485; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
10486; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10487; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10488; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10489; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10490; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4]
10491; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6]
10492; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10493; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
10494; AVX-NEXT:    # xmm12 = mem[0,1,2],xmm0[3]
10495; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
10496; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
10497; AVX-NEXT:    vmovaps 864(%rdi), %ymm0
10498; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10499; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm2[1,3],ymm0[4,5],ymm2[5,7]
10500; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10501; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
10502; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10503; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10504; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10505; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10506; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4]
10507; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm2[3,1],ymm8[0,2],ymm2[7,5],ymm8[4,6]
10508; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10509; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
10510; AVX-NEXT:    # xmm12 = mem[0,1,2],xmm0[3]
10511; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
10512; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
10513; AVX-NEXT:    vmovaps 1312(%rdi), %ymm0
10514; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10515; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm1[1,3],ymm0[4,5],ymm1[5,7]
10516; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10517; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
10518; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10519; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10520; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10521; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10522; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4]
10523; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[3,1],ymm8[0,2],ymm1[7,5],ymm8[4,6]
10524; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
10525; AVX-NEXT:    # xmm12 = xmm13[0,1,2],mem[3]
10526; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
10527; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
10528; AVX-NEXT:    vmovaps 1760(%rdi), %ymm0
10529; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10530; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm11[1,3],ymm0[4,5],ymm11[5,7]
10531; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4]
10532; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm12[5,6,7]
10533; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10534; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm4[0,0],ymm5[5,4],ymm4[4,4]
10535; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6]
10536; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm9[3]
10537; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
10538; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
10539; AVX-NEXT:    vmovaps 1536(%rdi), %ymm0
10540; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10541; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7]
10542; AVX-NEXT:    vmovaps %ymm3, %ymm15
10543; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm4[2,0],ymm7[4,6],ymm4[6,4]
10544; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
10545; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10546; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10547; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[0,0],ymm1[5,4],ymm14[4,4]
10548; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm2[0,2],ymm14[7,5],ymm2[4,6]
10549; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10550; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload
10551; AVX-NEXT:    # xmm4 = xmm12[0,1,2],mem[3]
10552; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
10553; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
10554; AVX-NEXT:    vmovaps 1088(%rdi), %ymm11
10555; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10556; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm10[1,3],ymm11[4,5],ymm10[5,7]
10557; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10558; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10559; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4]
10560; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
10561; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10562; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10563; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10564; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
10565; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6]
10566; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10567; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload
10568; AVX-NEXT:    # xmm4 = xmm9[0,1,2],mem[3]
10569; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
10570; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
10571; AVX-NEXT:    vmovaps 640(%rdi), %ymm8
10572; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10573; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm5[1,3],ymm8[4,5],ymm5[5,7]
10574; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10575; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10576; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[0,2],ymm4[2,0],ymm3[4,6],ymm4[6,4]
10577; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
10578; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10579; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10580; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10581; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
10582; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6]
10583; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10584; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload
10585; AVX-NEXT:    # xmm4 = xmm6[0,1,2],mem[3]
10586; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
10587; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
10588; AVX-NEXT:    vmovaps 192(%rdi), %ymm13
10589; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10590; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10591; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7]
10592; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10593; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm7[2,0],ymm2[4,6],ymm7[6,4]
10594; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm7[5,6,7]
10595; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10596; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
10597; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm4[0,0],ymm2[7,4],ymm4[4,4]
10598; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4]
10599; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
10600; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
10601; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10602; AVX-NEXT:    vmovaps 96(%rdi), %xmm1
10603; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10604; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
10605; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
10606; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
10607; AVX-NEXT:    vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
10608; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
10609; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10610; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10611; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10612; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
10613; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
10614; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10615; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10616; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4]
10617; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
10618; AVX-NEXT:    vmovaps 320(%rdi), %xmm1
10619; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10620; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
10621; AVX-NEXT:    vmovaps 288(%rdi), %xmm1
10622; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10623; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
10624; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10625; AVX-NEXT:    # xmm6 = mem[2,3,2,3]
10626; AVX-NEXT:    vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
10627; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
10628; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10629; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10630; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
10631; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4]
10632; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm5[2,0],ymm8[5,4],ymm5[6,4]
10633; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
10634; AVX-NEXT:    vmovaps 544(%rdi), %xmm1
10635; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10636; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10637; AVX-NEXT:    vmovaps 512(%rdi), %xmm6
10638; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
10639; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3]
10640; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
10641; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
10642; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10643; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10644; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10645; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
10646; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
10647; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10648; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10649; AVX-NEXT:    # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
10650; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
10651; AVX-NEXT:    vmovaps 768(%rdi), %xmm1
10652; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10653; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10654; AVX-NEXT:    vmovaps 736(%rdi), %xmm4
10655; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
10656; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10657; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
10658; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
10659; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
10660; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10661; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10662; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1]
10663; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4]
10664; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4]
10665; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
10666; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
10667; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10668; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10669; AVX-NEXT:    vmovaps 960(%rdi), %xmm3
10670; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
10671; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm12[2,3,2,3]
10672; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
10673; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
10674; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10675; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10676; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10677; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
10678; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
10679; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10680; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
10681; AVX-NEXT:    # ymm1 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
10682; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
10683; AVX-NEXT:    vmovaps 1216(%rdi), %xmm1
10684; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10685; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10686; AVX-NEXT:    vmovaps 1184(%rdi), %xmm2
10687; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10688; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
10689; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
10690; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
10691; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
10692; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
10693; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10694; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10695; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10696; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
10697; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
10698; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10699; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
10700; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
10701; AVX-NEXT:    vmovaps 1440(%rdi), %xmm1
10702; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10703; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1]
10704; AVX-NEXT:    vmovaps 1408(%rdi), %xmm1
10705; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3]
10706; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
10707; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
10708; AVX-NEXT:    vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
10709; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
10710; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
10711; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10712; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10713; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
10714; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4]
10715; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10716; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10717; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4]
10718; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm10[2,0],ymm0[6,4],ymm10[6,4]
10719; AVX-NEXT:    vmovaps 1664(%rdi), %xmm0
10720; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10721; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1]
10722; AVX-NEXT:    vmovaps 1632(%rdi), %xmm0
10723; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3]
10724; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10725; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
10726; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
10727; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3]
10728; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7]
10729; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10730; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10731; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
10732; AVX-NEXT:    # ymm9 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7]
10733; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10734; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
10735; AVX-NEXT:    # xmm10 = mem[0],xmm2[1],mem[2,3]
10736; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
10737; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm9[2,0],ymm10[5,4],ymm9[6,4]
10738; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10739; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm8 # 16-byte Folded Reload
10740; AVX-NEXT:    # xmm8 = xmm14[0,1,2],mem[3]
10741; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10742; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10743; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm12[1,0],ymm2[4,4],ymm12[5,4]
10744; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
10745; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[3,2]
10746; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
10747; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10748; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm13[2,1],ymm7[3,3],ymm13[6,5],ymm7[7,7]
10749; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10750; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
10751; AVX-NEXT:    # xmm9 = mem[0],xmm9[1],mem[2,3]
10752; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
10753; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4]
10754; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10755; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
10756; AVX-NEXT:    # xmm7 = mem[0,1,2],xmm7[3]
10757; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10758; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10759; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,0],ymm13[1,0],ymm10[4,4],ymm13[5,4]
10760; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
10761; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[3,2]
10762; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
10763; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10764; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10765; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10766; AVX-NEXT:    # ymm7 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7]
10767; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10768; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10769; AVX-NEXT:    # xmm8 = mem[0],xmm8[1],mem[2,3]
10770; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10771; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4]
10772; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
10773; AVX-NEXT:    # xmm6 = mem[0,1,2],xmm6[3]
10774; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10775; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
10776; AVX-NEXT:    # ymm8 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4]
10777; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
10778; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
10779; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
10780; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10781; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10782; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10783; AVX-NEXT:    # ymm6 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7]
10784; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10785; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
10786; AVX-NEXT:    # xmm7 = mem[0],xmm7[1],mem[2,3]
10787; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
10788; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4]
10789; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10790; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
10791; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10792; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload
10793; AVX-NEXT:    # ymm7 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4]
10794; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
10795; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm7[2,0],xmm4[3,2]
10796; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
10797; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10798; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10799; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
10800; AVX-NEXT:    # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7]
10801; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10802; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
10803; AVX-NEXT:    # xmm6 = mem[0],xmm6[1],mem[2,3]
10804; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
10805; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
10806; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10807; AVX-NEXT:    # xmm3 = mem[0,1,2],xmm3[3]
10808; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10809; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
10810; AVX-NEXT:    # ymm6 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4]
10811; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
10812; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[3,2]
10813; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
10814; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10815; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
10816; AVX-NEXT:    # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
10817; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10818; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10819; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
10820; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
10821; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
10822; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10823; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
10824; AVX-NEXT:    # xmm4 = xmm11[0,1,2],mem[3]
10825; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10826; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10827; AVX-NEXT:    # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
10828; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
10829; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[3,2]
10830; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
10831; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10832; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10833; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
10834; AVX-NEXT:    # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7]
10835; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10836; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10837; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
10838; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
10839; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
10840; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10841; AVX-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
10842; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10843; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
10844; AVX-NEXT:    # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4]
10845; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
10846; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[3,2]
10847; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
10848; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10849; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[2,1],ymm15[3,3],ymm5[6,5],ymm15[7,7]
10850; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10851; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10852; AVX-NEXT:    # xmm3 = mem[0],xmm3[1],mem[2,3]
10853; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
10854; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4]
10855; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10856; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
10857; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10858; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
10859; AVX-NEXT:    # ymm3 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
10860; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
10861; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[3,2]
10862; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10863; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10864; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10865; AVX-NEXT:    # xmm0 = mem[0,1,0,1]
10866; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
10867; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
10868; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
10869; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
10870; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10871; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
10872; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
10873; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10874; AVX-NEXT:    # xmm3 = mem[2,3,2,3]
10875; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10876; AVX-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
10877; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
10878; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
10879; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7]
10880; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10881; AVX-NEXT:    # xmm0 = mem[0,1,0,1]
10882; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10883; AVX-NEXT:    # xmm0 = xmm0[0,1,2],mem[3]
10884; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm13[2,0],ymm10[5,4],ymm13[6,4]
10885; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
10886; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
10887; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10888; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
10889; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
10890; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10891; AVX-NEXT:    # xmm4 = mem[2,3,2,3]
10892; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10893; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
10894; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
10895; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
10896; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7]
10897; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10898; AVX-NEXT:    # xmm0 = mem[0,1,0,1]
10899; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10900; AVX-NEXT:    # xmm0 = xmm0[0,1,2],mem[3]
10901; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
10902; AVX-NEXT:    # ymm3 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4]
10903; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
10904; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
10905; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10906; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
10907; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4]
10908; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10909; AVX-NEXT:    # xmm4 = mem[2,3,2,3]
10910; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10911; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
10912; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
10913; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
10914; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
10915; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10916; AVX-NEXT:    # xmm3 = mem[0,1,0,1]
10917; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10918; AVX-NEXT:    # xmm3 = xmm3[0,1,2],mem[3]
10919; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
10920; AVX-NEXT:    # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4]
10921; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
10922; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3]
10923; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10924; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1]
10925; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm4[0,0],ymm5[7,4],ymm4[4,4]
10926; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10927; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
10928; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10929; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
10930; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
10931; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,0],ymm8[4,5],ymm4[6,4]
10932; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
10933; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10934; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
10935; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10936; AVX-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
10937; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
10938; AVX-NEXT:    # ymm8 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
10939; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
10940; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[2,3]
10941; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10942; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1]
10943; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm5[3,0],ymm8[0,0],ymm5[7,4],ymm8[4,4]
10944; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10945; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
10946; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
10947; AVX-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
10948; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
10949; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4]
10950; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7]
10951; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10952; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
10953; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm11[3]
10954; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10955; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload
10956; AVX-NEXT:    # ymm9 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4]
10957; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
10958; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3]
10959; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10960; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1]
10961; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm5[3,0],ymm9[0,0],ymm5[7,4],ymm9[4,4]
10962; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
10963; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
10964; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
10965; AVX-NEXT:    # xmm10 = xmm10[0],mem[1],xmm10[2,3]
10966; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
10967; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
10968; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7]
10969; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10970; AVX-NEXT:    # xmm9 = mem[0,1,0,1]
10971; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
10972; AVX-NEXT:    # xmm5 = xmm9[0,1,2],mem[3]
10973; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10974; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
10975; AVX-NEXT:    # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
10976; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
10977; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[2,0],xmm5[2,3]
10978; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10979; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1]
10980; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4]
10981; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
10982; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
10983; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
10984; AVX-NEXT:    # xmm10 = xmm10[0],mem[1],xmm10[2,3]
10985; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
10986; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
10987; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
10988; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10989; AVX-NEXT:    # xmm9 = mem[0,1,0,1]
10990; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3]
10991; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
10992; AVX-NEXT:    # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
10993; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
10994; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3]
10995; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10996; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm6[2,3,0,1]
10997; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm10[0,0],ymm6[7,4],ymm10[4,4]
10998; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
10999; AVX-NEXT:    # xmm12 = mem[2,3,2,3]
11000; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
11001; AVX-NEXT:    # xmm12 = xmm12[0],mem[1],xmm12[2,3]
11002; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
11003; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,0],ymm12[4,5],ymm10[6,4]
11004; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
11005; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11006; AVX-NEXT:    vmovaps %ymm6, 192(%rsi)
11007; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11008; AVX-NEXT:    vmovaps %ymm6, 128(%rsi)
11009; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11010; AVX-NEXT:    vmovaps %ymm6, 64(%rsi)
11011; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11012; AVX-NEXT:    vmovaps %ymm6, (%rsi)
11013; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11014; AVX-NEXT:    vmovaps %ymm6, 224(%rsi)
11015; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11016; AVX-NEXT:    vmovaps %ymm10, 160(%rsi)
11017; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11018; AVX-NEXT:    vmovaps %ymm10, 96(%rsi)
11019; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11020; AVX-NEXT:    vmovaps %ymm10, 32(%rsi)
11021; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11022; AVX-NEXT:    vmovaps %ymm6, 192(%rdx)
11023; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11024; AVX-NEXT:    vmovaps %ymm6, 128(%rdx)
11025; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11026; AVX-NEXT:    vmovaps %ymm6, 64(%rdx)
11027; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11028; AVX-NEXT:    vmovaps %ymm6, (%rdx)
11029; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11030; AVX-NEXT:    vmovaps %ymm6, 224(%rdx)
11031; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11032; AVX-NEXT:    vmovaps %ymm6, 160(%rdx)
11033; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11034; AVX-NEXT:    vmovaps %ymm6, 96(%rdx)
11035; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11036; AVX-NEXT:    vmovaps %ymm6, 32(%rdx)
11037; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11038; AVX-NEXT:    vmovaps %ymm6, 192(%rcx)
11039; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11040; AVX-NEXT:    vmovaps %ymm6, 128(%rcx)
11041; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11042; AVX-NEXT:    vmovaps %ymm6, 64(%rcx)
11043; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11044; AVX-NEXT:    vmovaps %ymm6, (%rcx)
11045; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11046; AVX-NEXT:    vmovaps %ymm6, 224(%rcx)
11047; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11048; AVX-NEXT:    vmovaps %ymm6, 160(%rcx)
11049; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11050; AVX-NEXT:    vmovaps %ymm6, 96(%rcx)
11051; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11052; AVX-NEXT:    vmovaps %ymm6, 32(%rcx)
11053; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11054; AVX-NEXT:    vmovaps %ymm6, (%r8)
11055; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11056; AVX-NEXT:    vmovaps %ymm6, 64(%r8)
11057; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11058; AVX-NEXT:    vmovaps %ymm6, 128(%r8)
11059; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11060; AVX-NEXT:    vmovaps %ymm6, 192(%r8)
11061; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11062; AVX-NEXT:    vmovaps %ymm6, 224(%r8)
11063; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11064; AVX-NEXT:    vmovaps %ymm6, 160(%r8)
11065; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11066; AVX-NEXT:    vmovaps %ymm6, 96(%r8)
11067; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11068; AVX-NEXT:    vmovaps %ymm6, 32(%r8)
11069; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11070; AVX-NEXT:    vmovaps %ymm6, 224(%r9)
11071; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11072; AVX-NEXT:    vmovaps %ymm6, 192(%r9)
11073; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11074; AVX-NEXT:    vmovaps %ymm6, 160(%r9)
11075; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11076; AVX-NEXT:    vmovaps %ymm6, 128(%r9)
11077; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11078; AVX-NEXT:    vmovaps %ymm6, 96(%r9)
11079; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11080; AVX-NEXT:    vmovaps %ymm6, 64(%r9)
11081; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11082; AVX-NEXT:    vmovaps %ymm6, 32(%r9)
11083; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11084; AVX-NEXT:    vmovaps %ymm6, (%r9)
11085; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11086; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11087; AVX-NEXT:    vmovaps %ymm6, 224(%rax)
11088; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11089; AVX-NEXT:    vmovaps %ymm6, 192(%rax)
11090; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11091; AVX-NEXT:    vmovaps %ymm6, 160(%rax)
11092; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11093; AVX-NEXT:    vmovaps %ymm6, 128(%rax)
11094; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11095; AVX-NEXT:    vmovaps %ymm6, 96(%rax)
11096; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11097; AVX-NEXT:    vmovaps %ymm6, 64(%rax)
11098; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11099; AVX-NEXT:    vmovaps %ymm6, 32(%rax)
11100; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11101; AVX-NEXT:    vmovaps %ymm6, (%rax)
11102; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11103; AVX-NEXT:    vmovaps %ymm9, 224(%rax)
11104; AVX-NEXT:    vmovaps %ymm5, 192(%rax)
11105; AVX-NEXT:    vmovaps %ymm4, 160(%rax)
11106; AVX-NEXT:    vmovaps %ymm8, 128(%rax)
11107; AVX-NEXT:    vmovaps %ymm3, 96(%rax)
11108; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
11109; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
11110; AVX-NEXT:    vmovaps %ymm2, (%rax)
11111; AVX-NEXT:    addq $3176, %rsp # imm = 0xC68
11112; AVX-NEXT:    vzeroupper
11113; AVX-NEXT:    retq
11114;
11115; AVX2-LABEL: load_i32_stride7_vf64:
11116; AVX2:       # %bb.0:
11117; AVX2-NEXT:    subq $2648, %rsp # imm = 0xA58
11118; AVX2-NEXT:    vmovdqa 1216(%rdi), %ymm9
11119; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11120; AVX2-NEXT:    vmovdqa 1152(%rdi), %ymm4
11121; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11122; AVX2-NEXT:    vmovdqa 1120(%rdi), %ymm5
11123; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm12
11124; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11125; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm6
11126; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm7
11127; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm8
11128; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11129; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm10
11130; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm11
11131; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
11132; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7]
11133; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11134; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11135; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
11136; AVX2-NEXT:    vpbroadcastq 304(%rdi), %ymm2
11137; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
11138; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
11139; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm2
11140; AVX2-NEXT:    vmovdqa 384(%rdi), %xmm3
11141; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11142; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11143; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11144; AVX2-NEXT:    vpbroadcastd 420(%rdi), %ymm3
11145; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11146; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11147; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11148; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
11149; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11150; AVX2-NEXT:    vmovdqa %ymm6, %ymm8
11151; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11152; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
11153; AVX2-NEXT:    vpbroadcastq 752(%rdi), %ymm2
11154; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
11155; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
11156; AVX2-NEXT:    vmovdqa 800(%rdi), %xmm2
11157; AVX2-NEXT:    vmovdqa 832(%rdi), %xmm3
11158; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11159; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11160; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11161; AVX2-NEXT:    vpbroadcastd 868(%rdi), %ymm3
11162; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11163; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11164; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11165; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
11166; AVX2-NEXT:    vmovdqa %ymm5, %ymm6
11167; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11168; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
11169; AVX2-NEXT:    vpbroadcastq 1200(%rdi), %ymm2
11170; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
11171; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
11172; AVX2-NEXT:    vmovdqa 1248(%rdi), %xmm2
11173; AVX2-NEXT:    vmovdqa 1280(%rdi), %xmm3
11174; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11175; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11176; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11177; AVX2-NEXT:    vpbroadcastd 1316(%rdi), %ymm3
11178; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11179; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11180; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11181; AVX2-NEXT:    vmovdqa 1600(%rdi), %ymm13
11182; AVX2-NEXT:    vmovdqa 1568(%rdi), %ymm5
11183; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7]
11184; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11185; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11186; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
11187; AVX2-NEXT:    vmovdqa 1664(%rdi), %ymm3
11188; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11189; AVX2-NEXT:    vpbroadcastq 1648(%rdi), %ymm2
11190; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11191; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
11192; AVX2-NEXT:    vmovdqa 1696(%rdi), %xmm2
11193; AVX2-NEXT:    vmovdqa 1728(%rdi), %xmm3
11194; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11195; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11196; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11197; AVX2-NEXT:    vpbroadcastd 1764(%rdi), %ymm3
11198; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11199; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11200; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11201; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm2
11202; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11203; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm1
11204; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
11205; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
11206; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11207; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm3
11208; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11209; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
11210; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11211; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
11212; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm2
11213; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
11214; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11215; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11216; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11217; AVX2-NEXT:    vpbroadcastd 196(%rdi), %ymm3
11218; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11219; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11220; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11221; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm2
11222; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11223; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm1
11224; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11225; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
11226; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
11227; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm3
11228; AVX2-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
11229; AVX2-NEXT:    vpbroadcastq 528(%rdi), %ymm2
11230; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11231; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
11232; AVX2-NEXT:    vmovdqa 576(%rdi), %xmm2
11233; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm3
11234; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11235; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11236; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11237; AVX2-NEXT:    vpbroadcastd 644(%rdi), %ymm3
11238; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11239; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11240; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11241; AVX2-NEXT:    vmovdqa 928(%rdi), %ymm2
11242; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11243; AVX2-NEXT:    vmovdqa 896(%rdi), %ymm1
11244; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11245; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
11246; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
11247; AVX2-NEXT:    vmovdqa 992(%rdi), %ymm3
11248; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11249; AVX2-NEXT:    vpbroadcastq 976(%rdi), %ymm2
11250; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11251; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
11252; AVX2-NEXT:    vmovdqa 1024(%rdi), %xmm2
11253; AVX2-NEXT:    vmovdqa 1056(%rdi), %xmm3
11254; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11255; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
11256; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11257; AVX2-NEXT:    vpbroadcastd 1092(%rdi), %ymm3
11258; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11259; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
11260; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11261; AVX2-NEXT:    vmovdqa 1376(%rdi), %ymm14
11262; AVX2-NEXT:    vmovdqa 1344(%rdi), %ymm15
11263; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
11264; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11265; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11266; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
11267; AVX2-NEXT:    vmovdqa 1440(%rdi), %ymm4
11268; AVX2-NEXT:    vpbroadcastq 1424(%rdi), %ymm1
11269; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
11270; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11271; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
11272; AVX2-NEXT:    vmovdqa 1472(%rdi), %xmm1
11273; AVX2-NEXT:    vmovdqa 1504(%rdi), %xmm2
11274; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11275; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
11276; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11277; AVX2-NEXT:    vpbroadcastd 1540(%rdi), %ymm2
11278; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11279; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11280; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11281; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
11282; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
11283; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm1
11284; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11285; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm0
11286; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11287; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
11288; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
11289; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
11290; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm12
11291; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11292; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7]
11293; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11294; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
11295; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11296; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
11297; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
11298; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11299; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11300; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11301; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11302; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11303; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm3
11304; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11305; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm2
11306; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11307; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
11308; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11309; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11310; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm2
11311; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11312; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11313; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
11314; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
11315; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11316; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
11317; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11318; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11319; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11320; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11321; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11322; AVX2-NEXT:    vmovdqa 1280(%rdi), %ymm3
11323; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11324; AVX2-NEXT:    vmovdqa 1248(%rdi), %ymm2
11325; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11326; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
11327; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11328; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11329; AVX2-NEXT:    vmovdqa 1184(%rdi), %ymm2
11330; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11331; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11332; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
11333; AVX2-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
11334; AVX2-NEXT:    # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
11335; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11336; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7]
11337; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11338; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11339; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11340; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11341; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11342; AVX2-NEXT:    vmovdqa 1728(%rdi), %ymm3
11343; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11344; AVX2-NEXT:    vmovdqa 1696(%rdi), %ymm2
11345; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11346; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
11347; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11348; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11349; AVX2-NEXT:    vmovdqa 1632(%rdi), %ymm2
11350; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11351; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11352; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7]
11353; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
11354; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11355; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7]
11356; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11357; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11358; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11359; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11360; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11361; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm3
11362; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11363; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm2
11364; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11365; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
11366; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11367; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11368; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm2
11369; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11370; AVX2-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
11371; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
11372; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11373; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11374; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
11375; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11376; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
11377; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11378; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11379; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11380; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11381; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11382; AVX2-NEXT:    vmovdqa 1056(%rdi), %ymm3
11383; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11384; AVX2-NEXT:    vmovdqa 1024(%rdi), %ymm2
11385; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11386; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
11387; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11388; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11389; AVX2-NEXT:    vmovdqa 960(%rdi), %ymm2
11390; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11391; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11392; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
11393; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11394; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11395; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
11396; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11397; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
11398; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11399; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11400; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11401; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11402; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11403; AVX2-NEXT:    vmovdqa 1504(%rdi), %ymm3
11404; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11405; AVX2-NEXT:    vmovdqa 1472(%rdi), %ymm2
11406; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11407; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
11408; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11409; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11410; AVX2-NEXT:    vmovdqa 1408(%rdi), %ymm2
11411; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11412; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
11413; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
11414; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11415; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
11416; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
11417; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
11418; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11419; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
11420; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11421; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm15
11422; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm14
11423; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27]
11424; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11425; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11426; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
11427; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11428; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm4
11429; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11430; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
11431; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11432; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11433; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11434; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
11435; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11436; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
11437; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm0
11438; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11439; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11440; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm0
11441; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
11442; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11443; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm1
11444; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm5
11445; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
11446; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11447; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11448; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11449; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
11450; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm2
11451; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11452; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11453; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11454; AVX2-NEXT:    vmovdqa 752(%rdi), %xmm0
11455; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11456; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11457; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11458; AVX2-NEXT:    vpbroadcastd 680(%rdi), %xmm1
11459; AVX2-NEXT:    vmovdqa 704(%rdi), %xmm2
11460; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11461; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
11462; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11463; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11464; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11465; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
11466; AVX2-NEXT:    vpbroadcastd 876(%rdi), %ymm2
11467; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11468; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11469; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11470; AVX2-NEXT:    vmovdqa 1200(%rdi), %xmm0
11471; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11472; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11473; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11474; AVX2-NEXT:    vpbroadcastd 1128(%rdi), %xmm1
11475; AVX2-NEXT:    vmovdqa 1152(%rdi), %xmm2
11476; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11477; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
11478; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11479; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11480; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11481; AVX2-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
11482; AVX2-NEXT:    vpbroadcastd 1324(%rdi), %ymm2
11483; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11484; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11485; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11486; AVX2-NEXT:    vmovdqa 1648(%rdi), %xmm0
11487; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11488; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11489; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11490; AVX2-NEXT:    vpbroadcastd 1576(%rdi), %xmm1
11491; AVX2-NEXT:    vmovdqa 1600(%rdi), %xmm2
11492; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
11493; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11494; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11495; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11496; AVX2-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
11497; AVX2-NEXT:    vpbroadcastd 1772(%rdi), %ymm6
11498; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
11499; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11500; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11501; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
11502; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
11503; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11504; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm1
11505; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm4
11506; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11507; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
11508; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11509; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
11510; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm6
11511; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
11512; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11513; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11514; AVX2-NEXT:    vmovdqa 528(%rdi), %xmm0
11515; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11516; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11517; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11518; AVX2-NEXT:    vpbroadcastd 456(%rdi), %xmm1
11519; AVX2-NEXT:    vmovdqa 480(%rdi), %xmm4
11520; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11521; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
11522; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11523; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11524; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11525; AVX2-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
11526; AVX2-NEXT:    vpbroadcastd 652(%rdi), %ymm15
11527; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
11528; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11529; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11530; AVX2-NEXT:    vmovdqa 976(%rdi), %xmm0
11531; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11532; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11533; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
11534; AVX2-NEXT:    vpbroadcastd 904(%rdi), %xmm15
11535; AVX2-NEXT:    vmovdqa 928(%rdi), %xmm11
11536; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3]
11537; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
11538; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11539; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11540; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
11541; AVX2-NEXT:    vpbroadcastd 1100(%rdi), %ymm14
11542; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
11543; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
11544; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11545; AVX2-NEXT:    vmovdqa 1424(%rdi), %xmm0
11546; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11547; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11548; AVX2-NEXT:    vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
11549; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7]
11550; AVX2-NEXT:    vpbroadcastd 1352(%rdi), %xmm15
11551; AVX2-NEXT:    vmovdqa 1376(%rdi), %xmm0
11552; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
11553; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
11554; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11555; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11556; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
11557; AVX2-NEXT:    vpbroadcastd 1548(%rdi), %ymm13
11558; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
11559; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7]
11560; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11561; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11562; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
11563; AVX2-NEXT:    # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
11564; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
11565; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3]
11566; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
11567; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
11568; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11569; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7]
11570; AVX2-NEXT:    vmovaps %ymm4, %ymm12
11571; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm14
11572; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
11573; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
11574; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11575; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11576; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
11577; AVX2-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
11578; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11579; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3]
11580; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
11581; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
11582; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
11583; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11584; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7]
11585; AVX2-NEXT:    vbroadcastss 880(%rdi), %ymm13
11586; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
11587; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
11588; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11589; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11590; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
11591; AVX2-NEXT:    # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
11592; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11593; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
11594; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
11595; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
11596; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
11597; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11598; AVX2-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11599; AVX2-NEXT:    # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7]
11600; AVX2-NEXT:    vbroadcastss 1328(%rdi), %ymm5
11601; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
11602; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
11603; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11604; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11605; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11606; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
11607; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
11608; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
11609; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4]
11610; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
11611; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11612; AVX2-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
11613; AVX2-NEXT:    # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7]
11614; AVX2-NEXT:    vbroadcastss 1776(%rdi), %ymm4
11615; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
11616; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
11617; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11618; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7]
11619; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
11620; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
11621; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
11622; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
11623; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7]
11624; AVX2-NEXT:    vmovaps %ymm1, %ymm9
11625; AVX2-NEXT:    vbroadcastss 1552(%rdi), %ymm3
11626; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11627; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
11628; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11629; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11630; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11631; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
11632; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3]
11633; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
11634; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
11635; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11636; AVX2-NEXT:    vmovdqa %ymm10, %ymm8
11637; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7]
11638; AVX2-NEXT:    vmovaps %ymm7, %ymm11
11639; AVX2-NEXT:    vbroadcastss 1104(%rdi), %ymm2
11640; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11641; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11642; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11643; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
11644; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11645; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
11646; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11647; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
11648; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
11649; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
11650; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11651; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11652; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11653; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7]
11654; AVX2-NEXT:    vbroadcastss 656(%rdi), %ymm2
11655; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11656; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11657; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11658; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11659; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11660; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
11661; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11662; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
11663; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
11664; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
11665; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
11666; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11667; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11668; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7]
11669; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm2
11670; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11671; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
11672; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11673; AVX2-NEXT:    vbroadcastss 100(%rdi), %xmm0
11674; AVX2-NEXT:    vmovaps 64(%rdi), %xmm6
11675; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
11676; AVX2-NEXT:    vmovsd {{.*#+}} xmm5 = [4,3,0,0]
11677; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11678; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11679; AVX2-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
11680; AVX2-NEXT:    vpermps %ymm1, %ymm5, %ymm1
11681; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11682; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7]
11683; AVX2-NEXT:    vpermps %ymm3, %ymm15, %ymm1
11684; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
11685; AVX2-NEXT:    vbroadcastss 212(%rdi), %ymm2
11686; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11687; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11688; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11689; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11690; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11691; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
11692; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11693; AVX2-NEXT:    vbroadcastss 324(%rdi), %xmm2
11694; AVX2-NEXT:    vmovaps 288(%rdi), %xmm1
11695; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
11696; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
11697; AVX2-NEXT:    vpermps %ymm12, %ymm15, %ymm2
11698; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11699; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
11700; AVX2-NEXT:    vbroadcastss 436(%rdi), %ymm3
11701; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11702; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
11703; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11704; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11705; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11706; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
11707; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11708; AVX2-NEXT:    vbroadcastss 548(%rdi), %xmm3
11709; AVX2-NEXT:    vmovaps 512(%rdi), %xmm2
11710; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
11711; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
11712; AVX2-NEXT:    vpermps %ymm4, %ymm15, %ymm3
11713; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
11714; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm4
11715; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
11716; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
11717; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
11718; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11719; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11720; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
11721; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11722; AVX2-NEXT:    vbroadcastss 772(%rdi), %xmm4
11723; AVX2-NEXT:    vmovaps 736(%rdi), %xmm3
11724; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
11725; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
11726; AVX2-NEXT:    vpermps %ymm14, %ymm15, %ymm4
11727; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11728; AVX2-NEXT:    # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
11729; AVX2-NEXT:    vbroadcastss 884(%rdi), %ymm7
11730; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
11731; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
11732; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11733; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11734; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11735; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
11736; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11737; AVX2-NEXT:    vbroadcastss 996(%rdi), %xmm7
11738; AVX2-NEXT:    vmovaps 960(%rdi), %xmm4
11739; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
11740; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
11741; AVX2-NEXT:    vpermps %ymm8, %ymm15, %ymm7
11742; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
11743; AVX2-NEXT:    vbroadcastss 1108(%rdi), %ymm8
11744; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
11745; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
11746; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11747; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11748; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11749; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
11750; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
11751; AVX2-NEXT:    vbroadcastss 1220(%rdi), %xmm7
11752; AVX2-NEXT:    vmovaps 1184(%rdi), %xmm14
11753; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3]
11754; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
11755; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11756; AVX2-NEXT:    vpermps %ymm10, %ymm15, %ymm7
11757; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11758; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
11759; AVX2-NEXT:    vbroadcastss 1332(%rdi), %ymm8
11760; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
11761; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
11762; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11763; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11764; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11765; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
11766; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm7
11767; AVX2-NEXT:    vbroadcastss 1444(%rdi), %xmm8
11768; AVX2-NEXT:    vmovaps 1408(%rdi), %xmm0
11769; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3]
11770; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
11771; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
11772; AVX2-NEXT:    vmovaps %ymm9, %ymm11
11773; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
11774; AVX2-NEXT:    vbroadcastss 1556(%rdi), %ymm9
11775; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
11776; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
11777; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11778; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11779; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11780; AVX2-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
11781; AVX2-NEXT:    vpermps %ymm7, %ymm5, %ymm7
11782; AVX2-NEXT:    vbroadcastss 1668(%rdi), %xmm8
11783; AVX2-NEXT:    vmovaps 1632(%rdi), %xmm5
11784; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
11785; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
11786; AVX2-NEXT:    vpermps %ymm13, %ymm15, %ymm8
11787; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11788; AVX2-NEXT:    # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
11789; AVX2-NEXT:    vbroadcastss 1780(%rdi), %ymm9
11790; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
11791; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
11792; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11793; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11794; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11795; AVX2-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
11796; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
11797; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
11798; AVX2-NEXT:    vbroadcastss 216(%rdi), %ymm8
11799; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
11800; AVX2-NEXT:    vmovaps 96(%rdi), %xmm9
11801; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
11802; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
11803; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11804; AVX2-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
11805; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
11806; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
11807; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
11808; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11809; AVX2-NEXT:    vmovaps 320(%rdi), %xmm13
11810; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
11811; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
11812; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11813; AVX2-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
11814; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
11815; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
11816; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11817; AVX2-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
11818; AVX2-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
11819; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
11820; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
11821; AVX2-NEXT:    vbroadcastss 440(%rdi), %ymm7
11822; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
11823; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
11824; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11825; AVX2-NEXT:    vmovaps 544(%rdi), %xmm8
11826; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm2[3]
11827; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
11828; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11829; AVX2-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
11830; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
11831; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
11832; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11833; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11834; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
11835; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
11836; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
11837; AVX2-NEXT:    vbroadcastss 664(%rdi), %ymm6
11838; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
11839; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
11840; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11841; AVX2-NEXT:    vmovaps 768(%rdi), %xmm1
11842; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3]
11843; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
11844; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11845; AVX2-NEXT:    # ymm3 = mem[1,0,2,3,5,4,6,7]
11846; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
11847; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
11848; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11849; AVX2-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11850; AVX2-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
11851; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
11852; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
11853; AVX2-NEXT:    vbroadcastss 888(%rdi), %ymm6
11854; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
11855; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11856; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11857; AVX2-NEXT:    vmovaps 992(%rdi), %xmm2
11858; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3]
11859; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
11860; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11861; AVX2-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
11862; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
11863; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
11864; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11865; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11866; AVX2-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
11867; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
11868; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
11869; AVX2-NEXT:    vbroadcastss 1112(%rdi), %ymm6
11870; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
11871; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
11872; AVX2-NEXT:    vmovaps 1216(%rdi), %xmm3
11873; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3]
11874; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
11875; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11876; AVX2-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
11877; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
11878; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
11879; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
11880; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
11881; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
11882; AVX2-NEXT:    vbroadcastss 1336(%rdi), %ymm10
11883; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
11884; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
11885; AVX2-NEXT:    vmovaps 1440(%rdi), %xmm4
11886; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
11887; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
11888; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
11889; AVX2-NEXT:    # ymm10 = mem[1,0,2,3,5,4,6,7]
11890; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm10
11891; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
11892; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
11893; AVX2-NEXT:    # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
11894; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7]
11895; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
11896; AVX2-NEXT:    vbroadcastss 1560(%rdi), %ymm12
11897; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
11898; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11899; AVX2-NEXT:    vmovaps 1664(%rdi), %xmm14
11900; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3]
11901; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
11902; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11903; AVX2-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
11904; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
11905; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
11906; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11907; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
11908; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
11909; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
11910; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
11911; AVX2-NEXT:    vbroadcastss 1784(%rdi), %ymm12
11912; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7]
11913; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7]
11914; AVX2-NEXT:    vbroadcastss 136(%rdi), %xmm0
11915; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11916; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11917; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
11918; AVX2-NEXT:    vpermps 192(%rdi), %ymm15, %ymm5
11919; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
11920; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm5
11921; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
11922; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11923; AVX2-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
11924; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
11925; AVX2-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
11926; AVX2-NEXT:    vextractf128 $1, %ymm11, %xmm11
11927; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
11928; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7]
11929; AVX2-NEXT:    vbroadcastss 360(%rdi), %xmm0
11930; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11931; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11932; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
11933; AVX2-NEXT:    vpermps 416(%rdi), %ymm15, %ymm5
11934; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
11935; AVX2-NEXT:    vbroadcastss 304(%rdi), %ymm5
11936; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3]
11937; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11938; AVX2-NEXT:    # ymm13 = mem[2,3,2,3,6,7,6,7]
11939; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
11940; AVX2-NEXT:    # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
11941; AVX2-NEXT:    vextractf128 $1, %ymm13, %xmm13
11942; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
11943; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7]
11944; AVX2-NEXT:    vbroadcastss 584(%rdi), %xmm0
11945; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11946; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11947; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
11948; AVX2-NEXT:    vpermps 640(%rdi), %ymm15, %ymm5
11949; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
11950; AVX2-NEXT:    vbroadcastss 528(%rdi), %ymm5
11951; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
11952; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11953; AVX2-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
11954; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11955; AVX2-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
11956; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
11957; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
11958; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7]
11959; AVX2-NEXT:    vbroadcastss 808(%rdi), %xmm0
11960; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11961; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11962; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
11963; AVX2-NEXT:    vpermps 864(%rdi), %ymm15, %ymm5
11964; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
11965; AVX2-NEXT:    vbroadcastss 752(%rdi), %ymm5
11966; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
11967; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11968; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
11969; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
11970; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
11971; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
11972; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
11973; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11974; AVX2-NEXT:    vbroadcastss 1032(%rdi), %xmm1
11975; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11976; AVX2-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
11977; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
11978; AVX2-NEXT:    vpermps 1088(%rdi), %ymm15, %ymm5
11979; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
11980; AVX2-NEXT:    vbroadcastss 976(%rdi), %ymm5
11981; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
11982; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11983; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
11984; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
11985; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
11986; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
11987; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
11988; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11989; AVX2-NEXT:    vbroadcastss 1256(%rdi), %xmm2
11990; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11991; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11992; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
11993; AVX2-NEXT:    vpermps 1312(%rdi), %ymm15, %ymm5
11994; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
11995; AVX2-NEXT:    vbroadcastss 1200(%rdi), %ymm5
11996; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
11997; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11998; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
11999; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12000; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
12001; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
12002; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
12003; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
12004; AVX2-NEXT:    vbroadcastss 1480(%rdi), %xmm3
12005; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
12006; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
12007; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
12008; AVX2-NEXT:    vpermps 1536(%rdi), %ymm15, %ymm5
12009; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
12010; AVX2-NEXT:    vbroadcastss 1424(%rdi), %ymm5
12011; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
12012; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12013; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
12014; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12015; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
12016; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
12017; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12018; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
12019; AVX2-NEXT:    vbroadcastss 1704(%rdi), %xmm4
12020; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
12021; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
12022; AVX2-NEXT:    vpermps 1760(%rdi), %ymm15, %ymm5
12023; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
12024; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
12025; AVX2-NEXT:    vbroadcastss 1648(%rdi), %ymm5
12026; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3]
12027; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12028; AVX2-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
12029; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
12030; AVX2-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
12031; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
12032; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
12033; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
12034; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12035; AVX2-NEXT:    vmovaps %ymm5, 192(%rsi)
12036; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12037; AVX2-NEXT:    vmovaps %ymm5, 128(%rsi)
12038; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12039; AVX2-NEXT:    vmovaps %ymm5, 64(%rsi)
12040; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12041; AVX2-NEXT:    vmovaps %ymm5, (%rsi)
12042; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12043; AVX2-NEXT:    vmovaps %ymm5, 224(%rsi)
12044; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12045; AVX2-NEXT:    vmovaps %ymm5, 160(%rsi)
12046; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12047; AVX2-NEXT:    vmovaps %ymm5, 96(%rsi)
12048; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12049; AVX2-NEXT:    vmovaps %ymm5, 32(%rsi)
12050; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12051; AVX2-NEXT:    vmovaps %ymm5, 192(%rdx)
12052; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12053; AVX2-NEXT:    vmovaps %ymm5, 128(%rdx)
12054; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12055; AVX2-NEXT:    vmovaps %ymm5, 64(%rdx)
12056; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12057; AVX2-NEXT:    vmovaps %ymm5, (%rdx)
12058; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12059; AVX2-NEXT:    vmovaps %ymm5, 224(%rdx)
12060; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12061; AVX2-NEXT:    vmovaps %ymm5, 160(%rdx)
12062; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12063; AVX2-NEXT:    vmovaps %ymm5, 96(%rdx)
12064; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12065; AVX2-NEXT:    vmovaps %ymm5, 32(%rdx)
12066; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12067; AVX2-NEXT:    vmovaps %ymm5, 192(%rcx)
12068; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12069; AVX2-NEXT:    vmovaps %ymm5, 128(%rcx)
12070; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12071; AVX2-NEXT:    vmovaps %ymm5, 64(%rcx)
12072; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12073; AVX2-NEXT:    vmovaps %ymm5, (%rcx)
12074; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12075; AVX2-NEXT:    vmovaps %ymm5, 224(%rcx)
12076; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12077; AVX2-NEXT:    vmovaps %ymm5, 160(%rcx)
12078; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12079; AVX2-NEXT:    vmovaps %ymm5, 96(%rcx)
12080; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12081; AVX2-NEXT:    vmovaps %ymm5, 32(%rcx)
12082; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12083; AVX2-NEXT:    vmovaps %ymm5, (%r8)
12084; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12085; AVX2-NEXT:    vmovaps %ymm5, 64(%r8)
12086; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12087; AVX2-NEXT:    vmovaps %ymm5, 128(%r8)
12088; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12089; AVX2-NEXT:    vmovaps %ymm5, 192(%r8)
12090; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12091; AVX2-NEXT:    vmovaps %ymm5, 224(%r8)
12092; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12093; AVX2-NEXT:    vmovaps %ymm5, 160(%r8)
12094; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12095; AVX2-NEXT:    vmovaps %ymm5, 96(%r8)
12096; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12097; AVX2-NEXT:    vmovaps %ymm5, 32(%r8)
12098; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12099; AVX2-NEXT:    vmovaps %ymm5, 224(%r9)
12100; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12101; AVX2-NEXT:    vmovaps %ymm5, 192(%r9)
12102; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12103; AVX2-NEXT:    vmovaps %ymm5, 160(%r9)
12104; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12105; AVX2-NEXT:    vmovaps %ymm5, 128(%r9)
12106; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12107; AVX2-NEXT:    vmovaps %ymm5, 96(%r9)
12108; AVX2-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
12109; AVX2-NEXT:    vmovaps %ymm5, 64(%r9)
12110; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12111; AVX2-NEXT:    vmovaps %ymm5, 32(%r9)
12112; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12113; AVX2-NEXT:    vmovaps %ymm5, (%r9)
12114; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12115; AVX2-NEXT:    vmovaps %ymm12, 224(%rax)
12116; AVX2-NEXT:    vmovaps %ymm10, 192(%rax)
12117; AVX2-NEXT:    vmovaps %ymm6, 160(%rax)
12118; AVX2-NEXT:    vmovaps %ymm7, 128(%rax)
12119; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12120; AVX2-NEXT:    vmovaps %ymm5, 96(%rax)
12121; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12122; AVX2-NEXT:    vmovaps %ymm5, 64(%rax)
12123; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12124; AVX2-NEXT:    vmovaps %ymm5, 32(%rax)
12125; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12126; AVX2-NEXT:    vmovaps %ymm5, (%rax)
12127; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12128; AVX2-NEXT:    vmovaps %ymm4, 224(%rax)
12129; AVX2-NEXT:    vmovaps %ymm3, 192(%rax)
12130; AVX2-NEXT:    vmovaps %ymm2, 160(%rax)
12131; AVX2-NEXT:    vmovaps %ymm1, 128(%rax)
12132; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
12133; AVX2-NEXT:    vmovaps %ymm8, 64(%rax)
12134; AVX2-NEXT:    vmovaps %ymm13, 32(%rax)
12135; AVX2-NEXT:    vmovaps %ymm11, (%rax)
12136; AVX2-NEXT:    addq $2648, %rsp # imm = 0xA58
12137; AVX2-NEXT:    vzeroupper
12138; AVX2-NEXT:    retq
12139;
12140; AVX2-FP-LABEL: load_i32_stride7_vf64:
12141; AVX2-FP:       # %bb.0:
12142; AVX2-FP-NEXT:    subq $2648, %rsp # imm = 0xA58
12143; AVX2-FP-NEXT:    vmovdqa 1216(%rdi), %ymm9
12144; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12145; AVX2-FP-NEXT:    vmovdqa 1152(%rdi), %ymm4
12146; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12147; AVX2-FP-NEXT:    vmovdqa 1120(%rdi), %ymm5
12148; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm12
12149; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12150; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm6
12151; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm7
12152; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm8
12153; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12154; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm10
12155; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm11
12156; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
12157; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7]
12158; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12159; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12160; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
12161; AVX2-FP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
12162; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
12163; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
12164; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm2
12165; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %xmm3
12166; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12167; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12168; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12169; AVX2-FP-NEXT:    vpbroadcastd 420(%rdi), %ymm3
12170; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12171; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12172; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12173; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
12174; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12175; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm8
12176; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12177; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
12178; AVX2-FP-NEXT:    vpbroadcastq 752(%rdi), %ymm2
12179; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
12180; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
12181; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %xmm2
12182; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %xmm3
12183; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12184; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12185; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12186; AVX2-FP-NEXT:    vpbroadcastd 868(%rdi), %ymm3
12187; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12188; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12189; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12190; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
12191; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm6
12192; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12193; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
12194; AVX2-FP-NEXT:    vpbroadcastq 1200(%rdi), %ymm2
12195; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
12196; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
12197; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %xmm2
12198; AVX2-FP-NEXT:    vmovdqa 1280(%rdi), %xmm3
12199; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12200; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12201; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12202; AVX2-FP-NEXT:    vpbroadcastd 1316(%rdi), %ymm3
12203; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12204; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12205; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12206; AVX2-FP-NEXT:    vmovdqa 1600(%rdi), %ymm13
12207; AVX2-FP-NEXT:    vmovdqa 1568(%rdi), %ymm5
12208; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7]
12209; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12210; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12211; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
12212; AVX2-FP-NEXT:    vmovdqa 1664(%rdi), %ymm3
12213; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12214; AVX2-FP-NEXT:    vpbroadcastq 1648(%rdi), %ymm2
12215; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12216; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
12217; AVX2-FP-NEXT:    vmovdqa 1696(%rdi), %xmm2
12218; AVX2-FP-NEXT:    vmovdqa 1728(%rdi), %xmm3
12219; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12220; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12221; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12222; AVX2-FP-NEXT:    vpbroadcastd 1764(%rdi), %ymm3
12223; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12224; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12225; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12226; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm2
12227; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12228; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
12229; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12230; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
12231; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12232; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm3
12233; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12234; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
12235; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12236; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
12237; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm2
12238; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm3
12239; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12240; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12241; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12242; AVX2-FP-NEXT:    vpbroadcastd 196(%rdi), %ymm3
12243; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12244; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12245; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12246; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm2
12247; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12248; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm1
12249; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12250; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
12251; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
12252; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm3
12253; AVX2-FP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
12254; AVX2-FP-NEXT:    vpbroadcastq 528(%rdi), %ymm2
12255; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12256; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
12257; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %xmm2
12258; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm3
12259; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12260; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12261; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12262; AVX2-FP-NEXT:    vpbroadcastd 644(%rdi), %ymm3
12263; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12264; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12265; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12266; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %ymm2
12267; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12268; AVX2-FP-NEXT:    vmovdqa 896(%rdi), %ymm1
12269; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12270; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
12271; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
12272; AVX2-FP-NEXT:    vmovdqa 992(%rdi), %ymm3
12273; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12274; AVX2-FP-NEXT:    vpbroadcastq 976(%rdi), %ymm2
12275; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12276; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
12277; AVX2-FP-NEXT:    vmovdqa 1024(%rdi), %xmm2
12278; AVX2-FP-NEXT:    vmovdqa 1056(%rdi), %xmm3
12279; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12280; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
12281; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12282; AVX2-FP-NEXT:    vpbroadcastd 1092(%rdi), %ymm3
12283; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12284; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
12285; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12286; AVX2-FP-NEXT:    vmovdqa 1376(%rdi), %ymm14
12287; AVX2-FP-NEXT:    vmovdqa 1344(%rdi), %ymm15
12288; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
12289; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12290; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12291; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
12292; AVX2-FP-NEXT:    vmovdqa 1440(%rdi), %ymm4
12293; AVX2-FP-NEXT:    vpbroadcastq 1424(%rdi), %ymm1
12294; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
12295; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12296; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
12297; AVX2-FP-NEXT:    vmovdqa 1472(%rdi), %xmm1
12298; AVX2-FP-NEXT:    vmovdqa 1504(%rdi), %xmm2
12299; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12300; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
12301; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12302; AVX2-FP-NEXT:    vpbroadcastd 1540(%rdi), %ymm2
12303; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12304; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12305; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12306; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
12307; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
12308; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm1
12309; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12310; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm0
12311; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12312; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
12313; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
12314; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
12315; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm12
12316; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12317; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7]
12318; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12319; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
12320; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12321; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
12322; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
12323; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12324; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12325; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12326; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12327; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12328; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm3
12329; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12330; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm2
12331; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12332; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
12333; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12334; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12335; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm2
12336; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12337; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12338; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
12339; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
12340; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12341; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
12342; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12343; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12344; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12345; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12346; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12347; AVX2-FP-NEXT:    vmovdqa 1280(%rdi), %ymm3
12348; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12349; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %ymm2
12350; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12351; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
12352; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12353; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12354; AVX2-FP-NEXT:    vmovdqa 1184(%rdi), %ymm2
12355; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12356; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12357; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
12358; AVX2-FP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
12359; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
12360; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12361; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7]
12362; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12363; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12364; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12365; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12366; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12367; AVX2-FP-NEXT:    vmovdqa 1728(%rdi), %ymm3
12368; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12369; AVX2-FP-NEXT:    vmovdqa 1696(%rdi), %ymm2
12370; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12371; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
12372; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12373; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12374; AVX2-FP-NEXT:    vmovdqa 1632(%rdi), %ymm2
12375; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12376; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12377; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7]
12378; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
12379; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12380; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7]
12381; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12382; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12383; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12384; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12385; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12386; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm3
12387; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12388; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm2
12389; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12390; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
12391; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12392; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12393; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm2
12394; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12395; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
12396; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
12397; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12398; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12399; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
12400; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12401; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
12402; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12403; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12404; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12405; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12406; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12407; AVX2-FP-NEXT:    vmovdqa 1056(%rdi), %ymm3
12408; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12409; AVX2-FP-NEXT:    vmovdqa 1024(%rdi), %ymm2
12410; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12411; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
12412; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12413; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12414; AVX2-FP-NEXT:    vmovdqa 960(%rdi), %ymm2
12415; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12416; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12417; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
12418; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12419; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12420; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
12421; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12422; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
12423; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12424; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12425; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12426; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12427; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12428; AVX2-FP-NEXT:    vmovdqa 1504(%rdi), %ymm3
12429; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12430; AVX2-FP-NEXT:    vmovdqa 1472(%rdi), %ymm2
12431; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12432; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
12433; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12434; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12435; AVX2-FP-NEXT:    vmovdqa 1408(%rdi), %ymm2
12436; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12437; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
12438; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
12439; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12440; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
12441; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
12442; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
12443; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12444; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
12445; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12446; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm15
12447; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm14
12448; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27]
12449; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12450; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12451; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
12452; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12453; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm4
12454; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12455; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
12456; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12457; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12458; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12459; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
12460; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12461; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
12462; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
12463; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12464; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12465; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm0
12466; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
12467; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12468; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
12469; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm5
12470; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
12471; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12472; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12473; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12474; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
12475; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm2
12476; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12477; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12478; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12479; AVX2-FP-NEXT:    vmovdqa 752(%rdi), %xmm0
12480; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12481; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
12482; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12483; AVX2-FP-NEXT:    vpbroadcastd 680(%rdi), %xmm1
12484; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %xmm2
12485; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12486; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
12487; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12488; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12489; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12490; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
12491; AVX2-FP-NEXT:    vpbroadcastd 876(%rdi), %ymm2
12492; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12493; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12494; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12495; AVX2-FP-NEXT:    vmovdqa 1200(%rdi), %xmm0
12496; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12497; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
12498; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12499; AVX2-FP-NEXT:    vpbroadcastd 1128(%rdi), %xmm1
12500; AVX2-FP-NEXT:    vmovdqa 1152(%rdi), %xmm2
12501; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12502; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
12503; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12504; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12505; AVX2-FP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12506; AVX2-FP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
12507; AVX2-FP-NEXT:    vpbroadcastd 1324(%rdi), %ymm2
12508; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12509; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12510; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12511; AVX2-FP-NEXT:    vmovdqa 1648(%rdi), %xmm0
12512; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12513; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
12514; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12515; AVX2-FP-NEXT:    vpbroadcastd 1576(%rdi), %xmm1
12516; AVX2-FP-NEXT:    vmovdqa 1600(%rdi), %xmm2
12517; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
12518; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12519; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12520; AVX2-FP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12521; AVX2-FP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
12522; AVX2-FP-NEXT:    vpbroadcastd 1772(%rdi), %ymm6
12523; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
12524; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12525; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12526; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
12527; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
12528; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12529; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
12530; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm4
12531; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12532; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
12533; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12534; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
12535; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm6
12536; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
12537; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12538; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12539; AVX2-FP-NEXT:    vmovdqa 528(%rdi), %xmm0
12540; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12541; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
12542; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12543; AVX2-FP-NEXT:    vpbroadcastd 456(%rdi), %xmm1
12544; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %xmm4
12545; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12546; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
12547; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12548; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12549; AVX2-FP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12550; AVX2-FP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
12551; AVX2-FP-NEXT:    vpbroadcastd 652(%rdi), %ymm15
12552; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
12553; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12554; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12555; AVX2-FP-NEXT:    vmovdqa 976(%rdi), %xmm0
12556; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12557; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
12558; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
12559; AVX2-FP-NEXT:    vpbroadcastd 904(%rdi), %xmm15
12560; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %xmm11
12561; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3]
12562; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
12563; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12564; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12565; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
12566; AVX2-FP-NEXT:    vpbroadcastd 1100(%rdi), %ymm14
12567; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
12568; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
12569; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12570; AVX2-FP-NEXT:    vmovdqa 1424(%rdi), %xmm0
12571; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12572; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12573; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
12574; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7]
12575; AVX2-FP-NEXT:    vpbroadcastd 1352(%rdi), %xmm15
12576; AVX2-FP-NEXT:    vmovdqa 1376(%rdi), %xmm0
12577; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
12578; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
12579; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12580; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12581; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
12582; AVX2-FP-NEXT:    vpbroadcastd 1548(%rdi), %ymm13
12583; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
12584; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7]
12585; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12586; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12587; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
12588; AVX2-FP-NEXT:    # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
12589; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
12590; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3]
12591; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
12592; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
12593; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12594; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7]
12595; AVX2-FP-NEXT:    vmovaps %ymm4, %ymm12
12596; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm14
12597; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
12598; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
12599; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12600; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12601; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
12602; AVX2-FP-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
12603; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12604; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3]
12605; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
12606; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
12607; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
12608; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12609; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7]
12610; AVX2-FP-NEXT:    vbroadcastss 880(%rdi), %ymm13
12611; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
12612; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
12613; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12614; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12615; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
12616; AVX2-FP-NEXT:    # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
12617; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12618; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
12619; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
12620; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
12621; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
12622; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12623; AVX2-FP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
12624; AVX2-FP-NEXT:    # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7]
12625; AVX2-FP-NEXT:    vbroadcastss 1328(%rdi), %ymm5
12626; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
12627; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
12628; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12629; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12630; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12631; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
12632; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
12633; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
12634; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4]
12635; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
12636; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12637; AVX2-FP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
12638; AVX2-FP-NEXT:    # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7]
12639; AVX2-FP-NEXT:    vbroadcastss 1776(%rdi), %ymm4
12640; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
12641; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
12642; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12643; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7]
12644; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
12645; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
12646; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
12647; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
12648; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7]
12649; AVX2-FP-NEXT:    vmovaps %ymm1, %ymm9
12650; AVX2-FP-NEXT:    vbroadcastss 1552(%rdi), %ymm3
12651; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12652; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
12653; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12654; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12655; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12656; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
12657; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3]
12658; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
12659; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
12660; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12661; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm8
12662; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7]
12663; AVX2-FP-NEXT:    vmovaps %ymm7, %ymm11
12664; AVX2-FP-NEXT:    vbroadcastss 1104(%rdi), %ymm2
12665; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12666; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12667; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12668; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
12669; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12670; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
12671; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12672; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
12673; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
12674; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
12675; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12676; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12677; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12678; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7]
12679; AVX2-FP-NEXT:    vbroadcastss 656(%rdi), %ymm2
12680; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12681; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12682; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12683; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12684; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12685; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
12686; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12687; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
12688; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
12689; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
12690; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
12691; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12692; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12693; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7]
12694; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm2
12695; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12696; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
12697; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12698; AVX2-FP-NEXT:    vbroadcastss 100(%rdi), %xmm0
12699; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm6
12700; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
12701; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm5 = [4,3,0,0]
12702; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12703; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12704; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
12705; AVX2-FP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
12706; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12707; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7]
12708; AVX2-FP-NEXT:    vpermps %ymm3, %ymm15, %ymm1
12709; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
12710; AVX2-FP-NEXT:    vbroadcastss 212(%rdi), %ymm2
12711; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12712; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12713; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12714; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12715; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12716; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
12717; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
12718; AVX2-FP-NEXT:    vbroadcastss 324(%rdi), %xmm2
12719; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm1
12720; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
12721; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
12722; AVX2-FP-NEXT:    vpermps %ymm12, %ymm15, %ymm2
12723; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12724; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
12725; AVX2-FP-NEXT:    vbroadcastss 436(%rdi), %ymm3
12726; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12727; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
12728; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12729; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12730; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12731; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
12732; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
12733; AVX2-FP-NEXT:    vbroadcastss 548(%rdi), %xmm3
12734; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm2
12735; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
12736; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
12737; AVX2-FP-NEXT:    vpermps %ymm4, %ymm15, %ymm3
12738; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
12739; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm4
12740; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
12741; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
12742; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
12743; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12744; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12745; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
12746; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
12747; AVX2-FP-NEXT:    vbroadcastss 772(%rdi), %xmm4
12748; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm3
12749; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
12750; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
12751; AVX2-FP-NEXT:    vpermps %ymm14, %ymm15, %ymm4
12752; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
12753; AVX2-FP-NEXT:    # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
12754; AVX2-FP-NEXT:    vbroadcastss 884(%rdi), %ymm7
12755; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
12756; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
12757; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12758; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12759; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12760; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
12761; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
12762; AVX2-FP-NEXT:    vbroadcastss 996(%rdi), %xmm7
12763; AVX2-FP-NEXT:    vmovaps 960(%rdi), %xmm4
12764; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
12765; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
12766; AVX2-FP-NEXT:    vpermps %ymm8, %ymm15, %ymm7
12767; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
12768; AVX2-FP-NEXT:    vbroadcastss 1108(%rdi), %ymm8
12769; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
12770; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
12771; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12772; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12773; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12774; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
12775; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
12776; AVX2-FP-NEXT:    vbroadcastss 1220(%rdi), %xmm7
12777; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %xmm14
12778; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3]
12779; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
12780; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12781; AVX2-FP-NEXT:    vpermps %ymm10, %ymm15, %ymm7
12782; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12783; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
12784; AVX2-FP-NEXT:    vbroadcastss 1332(%rdi), %ymm8
12785; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
12786; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
12787; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12788; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12789; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12790; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
12791; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm7
12792; AVX2-FP-NEXT:    vbroadcastss 1444(%rdi), %xmm8
12793; AVX2-FP-NEXT:    vmovaps 1408(%rdi), %xmm0
12794; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3]
12795; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
12796; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
12797; AVX2-FP-NEXT:    vmovaps %ymm9, %ymm11
12798; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
12799; AVX2-FP-NEXT:    vbroadcastss 1556(%rdi), %ymm9
12800; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
12801; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
12802; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12803; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12804; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
12805; AVX2-FP-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
12806; AVX2-FP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
12807; AVX2-FP-NEXT:    vbroadcastss 1668(%rdi), %xmm8
12808; AVX2-FP-NEXT:    vmovaps 1632(%rdi), %xmm5
12809; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
12810; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
12811; AVX2-FP-NEXT:    vpermps %ymm13, %ymm15, %ymm8
12812; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
12813; AVX2-FP-NEXT:    # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
12814; AVX2-FP-NEXT:    vbroadcastss 1780(%rdi), %ymm9
12815; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
12816; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
12817; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12818; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12819; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
12820; AVX2-FP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
12821; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
12822; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
12823; AVX2-FP-NEXT:    vbroadcastss 216(%rdi), %ymm8
12824; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
12825; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm9
12826; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
12827; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
12828; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
12829; AVX2-FP-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
12830; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
12831; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
12832; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
12833; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12834; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm13
12835; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
12836; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
12837; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
12838; AVX2-FP-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
12839; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
12840; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
12841; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12842; AVX2-FP-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12843; AVX2-FP-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
12844; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
12845; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
12846; AVX2-FP-NEXT:    vbroadcastss 440(%rdi), %ymm7
12847; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
12848; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
12849; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12850; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm8
12851; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm2[3]
12852; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
12853; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12854; AVX2-FP-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
12855; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
12856; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12857; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12858; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12859; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12860; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
12861; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
12862; AVX2-FP-NEXT:    vbroadcastss 664(%rdi), %ymm6
12863; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
12864; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12865; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12866; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm1
12867; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3]
12868; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
12869; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12870; AVX2-FP-NEXT:    # ymm3 = mem[1,0,2,3,5,4,6,7]
12871; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm3
12872; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12873; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12874; AVX2-FP-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12875; AVX2-FP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
12876; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
12877; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
12878; AVX2-FP-NEXT:    vbroadcastss 888(%rdi), %ymm6
12879; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
12880; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12881; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12882; AVX2-FP-NEXT:    vmovaps 992(%rdi), %xmm2
12883; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3]
12884; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
12885; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12886; AVX2-FP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
12887; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
12888; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
12889; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12890; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
12891; AVX2-FP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
12892; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
12893; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
12894; AVX2-FP-NEXT:    vbroadcastss 1112(%rdi), %ymm6
12895; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
12896; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12897; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %xmm3
12898; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3]
12899; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
12900; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
12901; AVX2-FP-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
12902; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
12903; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
12904; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
12905; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
12906; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
12907; AVX2-FP-NEXT:    vbroadcastss 1336(%rdi), %ymm10
12908; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
12909; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
12910; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %xmm4
12911; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
12912; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
12913; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
12914; AVX2-FP-NEXT:    # ymm10 = mem[1,0,2,3,5,4,6,7]
12915; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm10
12916; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
12917; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
12918; AVX2-FP-NEXT:    # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
12919; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7]
12920; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
12921; AVX2-FP-NEXT:    vbroadcastss 1560(%rdi), %ymm12
12922; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
12923; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7]
12924; AVX2-FP-NEXT:    vmovaps 1664(%rdi), %xmm14
12925; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3]
12926; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
12927; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12928; AVX2-FP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
12929; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
12930; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
12931; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12932; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12933; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
12934; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
12935; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
12936; AVX2-FP-NEXT:    vbroadcastss 1784(%rdi), %ymm12
12937; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7]
12938; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7]
12939; AVX2-FP-NEXT:    vbroadcastss 136(%rdi), %xmm0
12940; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12941; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12942; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12943; AVX2-FP-NEXT:    vpermps 192(%rdi), %ymm15, %ymm5
12944; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
12945; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm5
12946; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
12947; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
12948; AVX2-FP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
12949; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
12950; AVX2-FP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
12951; AVX2-FP-NEXT:    vextractf128 $1, %ymm11, %xmm11
12952; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
12953; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12954; AVX2-FP-NEXT:    vbroadcastss 360(%rdi), %xmm0
12955; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12956; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12957; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12958; AVX2-FP-NEXT:    vpermps 416(%rdi), %ymm15, %ymm5
12959; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
12960; AVX2-FP-NEXT:    vbroadcastss 304(%rdi), %ymm5
12961; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3]
12962; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12963; AVX2-FP-NEXT:    # ymm13 = mem[2,3,2,3,6,7,6,7]
12964; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
12965; AVX2-FP-NEXT:    # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
12966; AVX2-FP-NEXT:    vextractf128 $1, %ymm13, %xmm13
12967; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
12968; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12969; AVX2-FP-NEXT:    vbroadcastss 584(%rdi), %xmm0
12970; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12971; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12972; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12973; AVX2-FP-NEXT:    vpermps 640(%rdi), %ymm15, %ymm5
12974; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
12975; AVX2-FP-NEXT:    vbroadcastss 528(%rdi), %ymm5
12976; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
12977; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
12978; AVX2-FP-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
12979; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
12980; AVX2-FP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
12981; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
12982; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
12983; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12984; AVX2-FP-NEXT:    vbroadcastss 808(%rdi), %xmm0
12985; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12986; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12987; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
12988; AVX2-FP-NEXT:    vpermps 864(%rdi), %ymm15, %ymm5
12989; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
12990; AVX2-FP-NEXT:    vbroadcastss 752(%rdi), %ymm5
12991; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
12992; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12993; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
12994; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12995; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
12996; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
12997; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
12998; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12999; AVX2-FP-NEXT:    vbroadcastss 1032(%rdi), %xmm1
13000; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13001; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
13002; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
13003; AVX2-FP-NEXT:    vpermps 1088(%rdi), %ymm15, %ymm5
13004; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
13005; AVX2-FP-NEXT:    vbroadcastss 976(%rdi), %ymm5
13006; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
13007; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13008; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
13009; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
13010; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
13011; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
13012; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
13013; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13014; AVX2-FP-NEXT:    vbroadcastss 1256(%rdi), %xmm2
13015; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13016; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13017; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
13018; AVX2-FP-NEXT:    vpermps 1312(%rdi), %ymm15, %ymm5
13019; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
13020; AVX2-FP-NEXT:    vbroadcastss 1200(%rdi), %ymm5
13021; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
13022; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13023; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
13024; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
13025; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
13026; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
13027; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
13028; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
13029; AVX2-FP-NEXT:    vbroadcastss 1480(%rdi), %xmm3
13030; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
13031; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
13032; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
13033; AVX2-FP-NEXT:    vpermps 1536(%rdi), %ymm15, %ymm5
13034; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
13035; AVX2-FP-NEXT:    vbroadcastss 1424(%rdi), %ymm5
13036; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
13037; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13038; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
13039; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
13040; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
13041; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
13042; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
13043; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
13044; AVX2-FP-NEXT:    vbroadcastss 1704(%rdi), %xmm4
13045; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
13046; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
13047; AVX2-FP-NEXT:    vpermps 1760(%rdi), %ymm15, %ymm5
13048; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
13049; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
13050; AVX2-FP-NEXT:    vbroadcastss 1648(%rdi), %ymm5
13051; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3]
13052; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
13053; AVX2-FP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
13054; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
13055; AVX2-FP-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
13056; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
13057; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
13058; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
13059; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13060; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rsi)
13061; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13062; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rsi)
13063; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13064; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rsi)
13065; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13066; AVX2-FP-NEXT:    vmovaps %ymm5, (%rsi)
13067; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13068; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rsi)
13069; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13070; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%rsi)
13071; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13072; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rsi)
13073; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13074; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rsi)
13075; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13076; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rdx)
13077; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13078; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rdx)
13079; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13080; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rdx)
13081; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13082; AVX2-FP-NEXT:    vmovaps %ymm5, (%rdx)
13083; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13084; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rdx)
13085; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13086; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%rdx)
13087; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13088; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rdx)
13089; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13090; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rdx)
13091; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13092; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rcx)
13093; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13094; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rcx)
13095; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13096; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rcx)
13097; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13098; AVX2-FP-NEXT:    vmovaps %ymm5, (%rcx)
13099; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13100; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rcx)
13101; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13102; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%rcx)
13103; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13104; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rcx)
13105; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13106; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rcx)
13107; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13108; AVX2-FP-NEXT:    vmovaps %ymm5, (%r8)
13109; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13110; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%r8)
13111; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13112; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%r8)
13113; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13114; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%r8)
13115; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13116; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%r8)
13117; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13118; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%r8)
13119; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13120; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%r8)
13121; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13122; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%r8)
13123; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13124; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%r9)
13125; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13126; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%r9)
13127; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13128; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%r9)
13129; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13130; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%r9)
13131; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13132; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%r9)
13133; AVX2-FP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
13134; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%r9)
13135; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13136; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%r9)
13137; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13138; AVX2-FP-NEXT:    vmovaps %ymm5, (%r9)
13139; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13140; AVX2-FP-NEXT:    vmovaps %ymm12, 224(%rax)
13141; AVX2-FP-NEXT:    vmovaps %ymm10, 192(%rax)
13142; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rax)
13143; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rax)
13144; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13145; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rax)
13146; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13147; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rax)
13148; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13149; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rax)
13150; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13151; AVX2-FP-NEXT:    vmovaps %ymm5, (%rax)
13152; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13153; AVX2-FP-NEXT:    vmovaps %ymm4, 224(%rax)
13154; AVX2-FP-NEXT:    vmovaps %ymm3, 192(%rax)
13155; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rax)
13156; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%rax)
13157; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
13158; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rax)
13159; AVX2-FP-NEXT:    vmovaps %ymm13, 32(%rax)
13160; AVX2-FP-NEXT:    vmovaps %ymm11, (%rax)
13161; AVX2-FP-NEXT:    addq $2648, %rsp # imm = 0xA58
13162; AVX2-FP-NEXT:    vzeroupper
13163; AVX2-FP-NEXT:    retq
13164;
13165; AVX2-FCP-LABEL: load_i32_stride7_vf64:
13166; AVX2-FCP:       # %bb.0:
13167; AVX2-FCP-NEXT:    subq $2648, %rsp # imm = 0xA58
13168; AVX2-FCP-NEXT:    vmovdqa 1216(%rdi), %ymm9
13169; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13170; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %ymm4
13171; AVX2-FCP-NEXT:    vmovdqa 1120(%rdi), %ymm5
13172; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13173; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm13
13174; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
13175; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm7
13176; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm8
13177; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13178; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm10
13179; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm11
13180; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
13181; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7]
13182; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13183; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13184; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
13185; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
13186; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
13187; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
13188; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
13189; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm3
13190; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13191; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13192; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13193; AVX2-FCP-NEXT:    vpbroadcastd 420(%rdi), %ymm3
13194; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13195; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13196; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13197; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
13198; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13199; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm8
13200; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13201; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
13202; AVX2-FCP-NEXT:    vpbroadcastq 752(%rdi), %ymm2
13203; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7]
13204; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm6
13205; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
13206; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %xmm2
13207; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %xmm3
13208; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13209; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13210; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13211; AVX2-FCP-NEXT:    vpbroadcastd 868(%rdi), %ymm3
13212; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13213; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13214; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13215; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
13216; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13217; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
13218; AVX2-FCP-NEXT:    vpbroadcastq 1200(%rdi), %ymm2
13219; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
13220; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
13221; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %xmm2
13222; AVX2-FCP-NEXT:    vmovdqa 1280(%rdi), %xmm3
13223; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13224; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13225; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13226; AVX2-FCP-NEXT:    vpbroadcastd 1316(%rdi), %ymm3
13227; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13228; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13229; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13230; AVX2-FCP-NEXT:    vmovdqa 1600(%rdi), %ymm1
13231; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13232; AVX2-FCP-NEXT:    vmovdqa 1568(%rdi), %ymm5
13233; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6],ymm5[7]
13234; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13235; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
13236; AVX2-FCP-NEXT:    vmovdqa 1664(%rdi), %ymm3
13237; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13238; AVX2-FCP-NEXT:    vpbroadcastq 1648(%rdi), %ymm2
13239; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
13240; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
13241; AVX2-FCP-NEXT:    vmovdqa 1696(%rdi), %xmm2
13242; AVX2-FCP-NEXT:    vmovdqa 1728(%rdi), %xmm3
13243; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13244; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13245; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13246; AVX2-FCP-NEXT:    vpbroadcastd 1764(%rdi), %ymm3
13247; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13248; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13249; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13250; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
13251; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13252; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
13253; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13254; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
13255; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13256; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
13257; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13258; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
13259; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13260; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
13261; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm2
13262; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
13263; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13264; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13265; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13266; AVX2-FCP-NEXT:    vpbroadcastd 196(%rdi), %ymm3
13267; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13268; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13269; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13270; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm2
13271; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13272; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
13273; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13274; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
13275; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
13276; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm3
13277; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13278; AVX2-FCP-NEXT:    vpbroadcastq 528(%rdi), %ymm2
13279; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
13280; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
13281; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %xmm2
13282; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %xmm3
13283; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13284; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13285; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13286; AVX2-FCP-NEXT:    vpbroadcastd 644(%rdi), %ymm3
13287; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13288; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13289; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13290; AVX2-FCP-NEXT:    vmovdqa 928(%rdi), %ymm2
13291; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13292; AVX2-FCP-NEXT:    vmovdqa 896(%rdi), %ymm1
13293; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13294; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
13295; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
13296; AVX2-FCP-NEXT:    vmovdqa 992(%rdi), %ymm12
13297; AVX2-FCP-NEXT:    vpbroadcastq 976(%rdi), %ymm2
13298; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
13299; AVX2-FCP-NEXT:    vmovdqu %ymm12, (%rsp) # 32-byte Spill
13300; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
13301; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %xmm2
13302; AVX2-FCP-NEXT:    vmovdqa 1056(%rdi), %xmm3
13303; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13304; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
13305; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13306; AVX2-FCP-NEXT:    vpbroadcastd 1092(%rdi), %ymm3
13307; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13308; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
13309; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13310; AVX2-FCP-NEXT:    vmovdqa 1376(%rdi), %ymm15
13311; AVX2-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm14
13312; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
13313; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13314; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13315; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
13316; AVX2-FCP-NEXT:    vmovdqa 1440(%rdi), %ymm9
13317; AVX2-FCP-NEXT:    vpbroadcastq 1424(%rdi), %ymm1
13318; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
13319; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13320; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
13321; AVX2-FCP-NEXT:    vmovdqa 1472(%rdi), %xmm1
13322; AVX2-FCP-NEXT:    vmovdqa 1504(%rdi), %xmm2
13323; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13324; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
13325; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13326; AVX2-FCP-NEXT:    vpbroadcastd 1540(%rdi), %ymm2
13327; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13328; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13329; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13330; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
13331; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
13332; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm1
13333; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13334; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
13335; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13336; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
13337; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
13338; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
13339; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm0
13340; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13341; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13342; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
13343; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
13344; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13345; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
13346; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
13347; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13348; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13349; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13350; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13351; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13352; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm2
13353; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13354; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm3
13355; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13356; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
13357; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13358; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13359; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm2
13360; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13361; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13362; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7]
13363; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
13364; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13365; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
13366; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13367; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13368; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13369; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13370; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13371; AVX2-FCP-NEXT:    vmovdqa 1280(%rdi), %ymm3
13372; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13373; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %ymm2
13374; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13375; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
13376; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13377; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13378; AVX2-FCP-NEXT:    vmovdqa 1184(%rdi), %ymm2
13379; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13380; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13381; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
13382; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
13383; AVX2-FCP-NEXT:    # ymm7 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
13384; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13385; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7]
13386; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13387; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13388; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13389; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13390; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13391; AVX2-FCP-NEXT:    vmovdqa 1728(%rdi), %ymm3
13392; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13393; AVX2-FCP-NEXT:    vmovdqa 1696(%rdi), %ymm2
13394; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13395; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
13396; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13397; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13398; AVX2-FCP-NEXT:    vmovdqa 1632(%rdi), %ymm2
13399; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13400; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13401; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
13402; AVX2-FCP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
13403; AVX2-FCP-NEXT:    # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7]
13404; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13405; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7]
13406; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13407; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13408; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13409; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13410; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13411; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm3
13412; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13413; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm2
13414; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13415; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
13416; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13417; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13418; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
13419; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13420; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
13421; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
13422; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13423; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13424; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
13425; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13426; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
13427; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13428; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13429; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13430; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13431; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13432; AVX2-FCP-NEXT:    vmovdqa 1056(%rdi), %ymm3
13433; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13434; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm2
13435; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13436; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
13437; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13438; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13439; AVX2-FCP-NEXT:    vmovdqa 960(%rdi), %ymm2
13440; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13441; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7]
13442; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13443; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13444; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
13445; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13446; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
13447; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13448; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13449; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13450; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13451; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13452; AVX2-FCP-NEXT:    vmovdqa 1504(%rdi), %ymm3
13453; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13454; AVX2-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm2
13455; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13456; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
13457; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13458; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13459; AVX2-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm2
13460; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13461; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
13462; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
13463; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13464; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
13465; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
13466; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
13467; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13468; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
13469; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
13470; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm15
13471; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm14
13472; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27]
13473; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13474; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13475; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
13476; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13477; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
13478; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
13479; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
13480; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13481; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13482; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13483; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
13484; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13485; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
13486; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
13487; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13488; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13489; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm0
13490; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13491; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
13492; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13493; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
13494; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm5
13495; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
13496; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13497; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13498; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13499; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
13500; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm2
13501; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13502; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13503; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13504; AVX2-FCP-NEXT:    vmovdqa 752(%rdi), %xmm0
13505; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13506; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
13507; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13508; AVX2-FCP-NEXT:    vpbroadcastd 680(%rdi), %xmm1
13509; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %xmm2
13510; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13511; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
13512; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13513; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13514; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
13515; AVX2-FCP-NEXT:    # ymm1 = ymm3[0],mem[0],ymm3[2],mem[2]
13516; AVX2-FCP-NEXT:    vpbroadcastd 876(%rdi), %ymm2
13517; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13518; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13519; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13520; AVX2-FCP-NEXT:    vmovdqa 1200(%rdi), %xmm0
13521; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13522; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
13523; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13524; AVX2-FCP-NEXT:    vpbroadcastd 1128(%rdi), %xmm1
13525; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %xmm2
13526; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13527; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
13528; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13529; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13530; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13531; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
13532; AVX2-FCP-NEXT:    vpbroadcastd 1324(%rdi), %ymm2
13533; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13534; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13535; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13536; AVX2-FCP-NEXT:    vmovdqa 1648(%rdi), %xmm0
13537; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13538; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
13539; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13540; AVX2-FCP-NEXT:    vpbroadcastd 1576(%rdi), %xmm1
13541; AVX2-FCP-NEXT:    vmovdqa 1600(%rdi), %xmm2
13542; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
13543; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13544; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13545; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
13546; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
13547; AVX2-FCP-NEXT:    vpbroadcastd 1772(%rdi), %ymm6
13548; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
13549; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13550; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13551; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
13552; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
13553; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13554; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
13555; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
13556; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13557; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
13558; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13559; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
13560; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm6
13561; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
13562; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13563; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13564; AVX2-FCP-NEXT:    vmovdqa 528(%rdi), %xmm0
13565; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13566; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
13567; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13568; AVX2-FCP-NEXT:    vpbroadcastd 456(%rdi), %xmm1
13569; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %xmm4
13570; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13571; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
13572; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13573; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13574; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
13575; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
13576; AVX2-FCP-NEXT:    vpbroadcastd 652(%rdi), %ymm15
13577; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
13578; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13579; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13580; AVX2-FCP-NEXT:    vmovdqa 976(%rdi), %xmm0
13581; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13582; AVX2-FCP-NEXT:    vpalignr $8, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
13583; AVX2-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
13584; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
13585; AVX2-FCP-NEXT:    vpbroadcastd 904(%rdi), %xmm15
13586; AVX2-FCP-NEXT:    vmovdqa 928(%rdi), %xmm12
13587; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3]
13588; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
13589; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13590; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
13591; AVX2-FCP-NEXT:    # ymm15 = ymm1[0],mem[0],ymm1[2],mem[2]
13592; AVX2-FCP-NEXT:    vpbroadcastd 1100(%rdi), %ymm14
13593; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
13594; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
13595; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13596; AVX2-FCP-NEXT:    vmovdqa 1424(%rdi), %xmm0
13597; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13598; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13599; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
13600; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7]
13601; AVX2-FCP-NEXT:    vpbroadcastd 1352(%rdi), %xmm15
13602; AVX2-FCP-NEXT:    vmovdqa 1376(%rdi), %xmm0
13603; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
13604; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
13605; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
13606; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13607; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm1[0],ymm11[2],ymm1[2]
13608; AVX2-FCP-NEXT:    vpbroadcastd 1548(%rdi), %ymm13
13609; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
13610; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7]
13611; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13612; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13613; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
13614; AVX2-FCP-NEXT:    # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
13615; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
13616; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3]
13617; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
13618; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
13619; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7]
13620; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm14
13621; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
13622; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
13623; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13624; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13625; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
13626; AVX2-FCP-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
13627; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13628; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3]
13629; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
13630; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
13631; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
13632; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13633; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7]
13634; AVX2-FCP-NEXT:    vbroadcastss 880(%rdi), %ymm13
13635; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
13636; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
13637; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13638; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13639; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
13640; AVX2-FCP-NEXT:    # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
13641; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13642; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
13643; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
13644; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
13645; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
13646; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
13647; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm8[1,3],ymm14[4,6],ymm8[5,7]
13648; AVX2-FCP-NEXT:    vmovaps %ymm8, %ymm13
13649; AVX2-FCP-NEXT:    vbroadcastss 1328(%rdi), %ymm5
13650; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
13651; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
13652; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13653; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13654; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13655; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
13656; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
13657; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
13658; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4]
13659; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
13660; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13661; AVX2-FCP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13662; AVX2-FCP-NEXT:    # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7]
13663; AVX2-FCP-NEXT:    vbroadcastss 1776(%rdi), %ymm4
13664; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
13665; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
13666; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13667; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4,5,6,7]
13668; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
13669; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
13670; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
13671; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
13672; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm1[1,3],ymm11[4,6],ymm1[5,7]
13673; AVX2-FCP-NEXT:    vbroadcastss 1552(%rdi), %ymm3
13674; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13675; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
13676; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13677; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
13678; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13679; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
13680; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
13681; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
13682; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
13683; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13684; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
13685; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13686; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm8[1,3],ymm11[4,6],ymm8[5,7]
13687; AVX2-FCP-NEXT:    vbroadcastss 1104(%rdi), %ymm2
13688; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13689; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13690; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13691; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13692; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13693; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
13694; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13695; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
13696; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
13697; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
13698; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13699; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13700; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13701; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7]
13702; AVX2-FCP-NEXT:    vbroadcastss 656(%rdi), %ymm2
13703; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13704; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13705; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13706; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13707; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13708; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
13709; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13710; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
13711; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
13712; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
13713; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13714; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13715; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
13716; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7]
13717; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm2
13718; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13719; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
13720; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13721; AVX2-FCP-NEXT:    vbroadcastss 100(%rdi), %xmm0
13722; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm3
13723; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3]
13724; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = [4,3,0,0]
13725; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13726; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13727; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
13728; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
13729; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13730; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7]
13731; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm15, %ymm2
13732; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
13733; AVX2-FCP-NEXT:    vbroadcastss 212(%rdi), %ymm4
13734; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
13735; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13736; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13737; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13738; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
13739; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
13740; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm2
13741; AVX2-FCP-NEXT:    vbroadcastss 324(%rdi), %xmm4
13742; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm1
13743; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
13744; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
13745; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
13746; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
13747; AVX2-FCP-NEXT:    vbroadcastss 436(%rdi), %ymm5
13748; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
13749; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
13750; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13751; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13752; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13753; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
13754; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm4
13755; AVX2-FCP-NEXT:    vbroadcastss 548(%rdi), %xmm5
13756; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %xmm2
13757; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3]
13758; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
13759; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm15, %ymm5
13760; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
13761; AVX2-FCP-NEXT:    vbroadcastss 660(%rdi), %ymm6
13762; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
13763; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
13764; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13765; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13766; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
13767; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
13768; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm5
13769; AVX2-FCP-NEXT:    vbroadcastss 772(%rdi), %xmm6
13770; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %xmm4
13771; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3]
13772; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
13773; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload
13774; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
13775; AVX2-FCP-NEXT:    vbroadcastss 884(%rdi), %ymm7
13776; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
13777; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
13778; AVX2-FCP-NEXT:    vmovups %ymm5, (%rsp) # 32-byte Spill
13779; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13780; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
13781; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7]
13782; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm0, %ymm6
13783; AVX2-FCP-NEXT:    vbroadcastss 996(%rdi), %xmm7
13784; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %xmm5
13785; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3]
13786; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
13787; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm15, %ymm7
13788; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
13789; AVX2-FCP-NEXT:    vbroadcastss 1108(%rdi), %ymm8
13790; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
13791; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
13792; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13793; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13794; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
13795; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
13796; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
13797; AVX2-FCP-NEXT:    vbroadcastss 1220(%rdi), %xmm7
13798; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %xmm10
13799; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3]
13800; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
13801; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm15, %ymm7
13802; AVX2-FCP-NEXT:    vmovaps %ymm13, %ymm11
13803; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7]
13804; AVX2-FCP-NEXT:    vbroadcastss 1332(%rdi), %ymm8
13805; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
13806; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
13807; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13808; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13809; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
13810; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
13811; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
13812; AVX2-FCP-NEXT:    vbroadcastss 1444(%rdi), %xmm7
13813; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %xmm14
13814; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3]
13815; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
13816; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13817; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm15, %ymm7
13818; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13819; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
13820; AVX2-FCP-NEXT:    vbroadcastss 1556(%rdi), %ymm8
13821; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
13822; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
13823; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13824; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13825; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
13826; AVX2-FCP-NEXT:    # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
13827; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
13828; AVX2-FCP-NEXT:    vbroadcastss 1668(%rdi), %xmm7
13829; AVX2-FCP-NEXT:    vmovaps 1632(%rdi), %xmm0
13830; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3]
13831; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
13832; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
13833; AVX2-FCP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
13834; AVX2-FCP-NEXT:    # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
13835; AVX2-FCP-NEXT:    vbroadcastss 1780(%rdi), %ymm8
13836; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
13837; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
13838; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13839; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload
13840; AVX2-FCP-NEXT:    # ymm6 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
13841; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7]
13842; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm12, %ymm6
13843; AVX2-FCP-NEXT:    vbroadcastss 216(%rdi), %ymm7
13844; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
13845; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm7
13846; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
13847; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
13848; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
13849; AVX2-FCP-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
13850; AVX2-FCP-NEXT:    vextractf128 $1, %ymm8, %xmm8
13851; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
13852; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
13853; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13854; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %xmm8
13855; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
13856; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
13857; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13858; AVX2-FCP-NEXT:    # ymm3 = mem[1,0,2,3,5,4,6,7]
13859; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm3
13860; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
13861; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13862; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13863; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
13864; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm3
13865; AVX2-FCP-NEXT:    vbroadcastss 440(%rdi), %ymm6
13866; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
13867; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
13868; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13869; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %xmm6
13870; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm2[3]
13871; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
13872; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13873; AVX2-FCP-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
13874; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
13875; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13876; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13877; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13878; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
13879; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
13880; AVX2-FCP-NEXT:    vbroadcastss 664(%rdi), %ymm3
13881; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
13882; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13883; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13884; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %xmm3
13885; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm4[3]
13886; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
13887; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13888; AVX2-FCP-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
13889; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
13890; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13891; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13892; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13893; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
13894; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
13895; AVX2-FCP-NEXT:    vbroadcastss 888(%rdi), %ymm4
13896; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
13897; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13898; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13899; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %xmm1
13900; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm5[3]
13901; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
13902; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13903; AVX2-FCP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
13904; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
13905; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
13906; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13907; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
13908; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
13909; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm12, %ymm4
13910; AVX2-FCP-NEXT:    vbroadcastss 1112(%rdi), %ymm5
13911; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
13912; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
13913; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13914; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %xmm2
13915; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm10[3]
13916; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
13917; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13918; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
13919; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
13920; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
13921; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
13922; AVX2-FCP-NEXT:    # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
13923; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm5
13924; AVX2-FCP-NEXT:    vbroadcastss 1336(%rdi), %ymm10
13925; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7]
13926; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7]
13927; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %xmm4
13928; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm4[0,1,2],xmm14[3]
13929; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2]
13930; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
13931; AVX2-FCP-NEXT:    # ymm14 = mem[1,0,2,3,5,4,6,7]
13932; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
13933; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
13934; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7]
13935; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm12, %ymm14
13936; AVX2-FCP-NEXT:    vbroadcastss 1560(%rdi), %ymm11
13937; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7]
13938; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
13939; AVX2-FCP-NEXT:    vmovaps 1664(%rdi), %xmm14
13940; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
13941; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
13942; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13943; AVX2-FCP-NEXT:    # ymm11 = mem[1,0,2,3,5,4,6,7]
13944; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
13945; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3]
13946; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13947; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
13948; AVX2-FCP-NEXT:    # ymm11 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
13949; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm11
13950; AVX2-FCP-NEXT:    vbroadcastss 1784(%rdi), %ymm12
13951; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
13952; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm11[4,5,6,7]
13953; AVX2-FCP-NEXT:    vbroadcastss 136(%rdi), %xmm0
13954; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13955; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13956; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13957; AVX2-FCP-NEXT:    vpermps 192(%rdi), %ymm15, %ymm11
13958; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
13959; AVX2-FCP-NEXT:    vbroadcastss 80(%rdi), %ymm11
13960; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3]
13961; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13962; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
13963; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
13964; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
13965; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
13966; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
13967; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
13968; AVX2-FCP-NEXT:    vbroadcastss 360(%rdi), %xmm0
13969; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13970; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13971; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13972; AVX2-FCP-NEXT:    vpermps 416(%rdi), %ymm15, %ymm11
13973; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
13974; AVX2-FCP-NEXT:    vbroadcastss 304(%rdi), %ymm11
13975; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3]
13976; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13977; AVX2-FCP-NEXT:    # ymm13 = mem[2,3,2,3,6,7,6,7]
13978; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
13979; AVX2-FCP-NEXT:    # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
13980; AVX2-FCP-NEXT:    vextractf128 $1, %ymm13, %xmm13
13981; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
13982; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm0[4,5,6,7]
13983; AVX2-FCP-NEXT:    vbroadcastss 584(%rdi), %xmm0
13984; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13985; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13986; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13987; AVX2-FCP-NEXT:    vpermps 640(%rdi), %ymm15, %ymm11
13988; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
13989; AVX2-FCP-NEXT:    vbroadcastss 528(%rdi), %ymm11
13990; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm11[0,1,2],xmm6[3]
13991; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13992; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
13993; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
13994; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
13995; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
13996; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
13997; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
13998; AVX2-FCP-NEXT:    vbroadcastss 808(%rdi), %xmm0
13999; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14000; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14001; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14002; AVX2-FCP-NEXT:    vpermps 864(%rdi), %ymm15, %ymm11
14003; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
14004; AVX2-FCP-NEXT:    vbroadcastss 752(%rdi), %ymm11
14005; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
14006; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
14007; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
14008; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
14009; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
14010; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
14011; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3]
14012; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
14013; AVX2-FCP-NEXT:    vbroadcastss 1032(%rdi), %xmm0
14014; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14015; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
14016; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14017; AVX2-FCP-NEXT:    vpermps 1088(%rdi), %ymm15, %ymm11
14018; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
14019; AVX2-FCP-NEXT:    vbroadcastss 976(%rdi), %ymm11
14020; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3]
14021; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
14022; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
14023; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
14024; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
14025; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
14026; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
14027; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14028; AVX2-FCP-NEXT:    vbroadcastss 1256(%rdi), %xmm1
14029; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14030; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
14031; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
14032; AVX2-FCP-NEXT:    vpermps 1312(%rdi), %ymm15, %ymm11
14033; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
14034; AVX2-FCP-NEXT:    vbroadcastss 1200(%rdi), %ymm11
14035; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3]
14036; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
14037; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
14038; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
14039; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
14040; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
14041; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
14042; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14043; AVX2-FCP-NEXT:    vbroadcastss 1480(%rdi), %xmm2
14044; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14045; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
14046; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
14047; AVX2-FCP-NEXT:    vpermps 1536(%rdi), %ymm15, %ymm11
14048; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
14049; AVX2-FCP-NEXT:    vbroadcastss 1424(%rdi), %ymm11
14050; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
14051; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
14052; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
14053; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
14054; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
14055; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
14056; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
14057; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
14058; AVX2-FCP-NEXT:    vbroadcastss 1704(%rdi), %xmm4
14059; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
14060; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
14061; AVX2-FCP-NEXT:    vpermps 1760(%rdi), %ymm15, %ymm11
14062; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
14063; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7]
14064; AVX2-FCP-NEXT:    vbroadcastss 1648(%rdi), %ymm11
14065; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3]
14066; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
14067; AVX2-FCP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
14068; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
14069; AVX2-FCP-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
14070; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
14071; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
14072; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
14073; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14074; AVX2-FCP-NEXT:    vmovaps %ymm11, 192(%rsi)
14075; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14076; AVX2-FCP-NEXT:    vmovaps %ymm11, 128(%rsi)
14077; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14078; AVX2-FCP-NEXT:    vmovaps %ymm11, 64(%rsi)
14079; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14080; AVX2-FCP-NEXT:    vmovaps %ymm11, (%rsi)
14081; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14082; AVX2-FCP-NEXT:    vmovaps %ymm11, 224(%rsi)
14083; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14084; AVX2-FCP-NEXT:    vmovaps %ymm11, 160(%rsi)
14085; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14086; AVX2-FCP-NEXT:    vmovaps %ymm11, 96(%rsi)
14087; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14088; AVX2-FCP-NEXT:    vmovaps %ymm11, 32(%rsi)
14089; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14090; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%rdx)
14091; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14092; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%rdx)
14093; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14094; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%rdx)
14095; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14096; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rdx)
14097; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14098; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%rdx)
14099; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14100; AVX2-FCP-NEXT:    vmovaps %ymm11, 160(%rdx)
14101; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14102; AVX2-FCP-NEXT:    vmovaps %ymm11, 96(%rdx)
14103; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14104; AVX2-FCP-NEXT:    vmovaps %ymm11, 32(%rdx)
14105; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14106; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%rcx)
14107; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14108; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%rcx)
14109; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14110; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%rcx)
14111; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14112; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rcx)
14113; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14114; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%rcx)
14115; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14116; AVX2-FCP-NEXT:    vmovaps %ymm9, 160(%rcx)
14117; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14118; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%rcx)
14119; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14120; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%rcx)
14121; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14122; AVX2-FCP-NEXT:    vmovaps %ymm9, (%r8)
14123; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14124; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%r8)
14125; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14126; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%r8)
14127; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14128; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%r8)
14129; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14130; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%r8)
14131; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14132; AVX2-FCP-NEXT:    vmovaps %ymm9, 160(%r8)
14133; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14134; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%r8)
14135; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14136; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%r8)
14137; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14138; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%r9)
14139; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14140; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%r9)
14141; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14142; AVX2-FCP-NEXT:    vmovaps %ymm9, 160(%r9)
14143; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14144; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%r9)
14145; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
14146; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%r9)
14147; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14148; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%r9)
14149; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14150; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%r9)
14151; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14152; AVX2-FCP-NEXT:    vmovaps %ymm9, (%r9)
14153; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14154; AVX2-FCP-NEXT:    vmovaps %ymm12, 224(%rax)
14155; AVX2-FCP-NEXT:    vmovaps %ymm10, 192(%rax)
14156; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rax)
14157; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14158; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rax)
14159; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14160; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rax)
14161; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14162; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rax)
14163; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14164; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rax)
14165; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14166; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rax)
14167; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14168; AVX2-FCP-NEXT:    vmovaps %ymm4, 224(%rax)
14169; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rax)
14170; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%rax)
14171; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
14172; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rax)
14173; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rax)
14174; AVX2-FCP-NEXT:    vmovaps %ymm13, 32(%rax)
14175; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rax)
14176; AVX2-FCP-NEXT:    addq $2648, %rsp # imm = 0xA58
14177; AVX2-FCP-NEXT:    vzeroupper
14178; AVX2-FCP-NEXT:    retq
14179;
14180; AVX512-LABEL: load_i32_stride7_vf64:
14181; AVX512:       # %bb.0:
14182; AVX512-NEXT:    subq $3400, %rsp # imm = 0xD48
14183; AVX512-NEXT:    vmovdqa64 1728(%rdi), %zmm2
14184; AVX512-NEXT:    vmovdqa64 1664(%rdi), %zmm17
14185; AVX512-NEXT:    vmovdqa64 1600(%rdi), %zmm11
14186; AVX512-NEXT:    vmovdqa64 1280(%rdi), %zmm7
14187; AVX512-NEXT:    vmovdqa64 1216(%rdi), %zmm5
14188; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm12
14189; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm6
14190; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm8
14191; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm13
14192; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm20
14193; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm4
14194; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm14
14195; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
14196; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14197; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm3
14198; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
14199; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
14200; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14201; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14202; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14203; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm3
14204; AVX512-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
14205; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14206; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14207; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm3
14208; AVX512-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
14209; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14210; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14211; AVX512-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
14212; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14213; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14214; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
14215; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14216; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm3
14217; AVX512-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
14218; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
14219; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14220; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14221; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14222; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm3
14223; AVX512-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
14224; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14225; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14226; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm3
14227; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
14228; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14229; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14230; AVX512-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
14231; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14232; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14233; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
14234; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14235; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm3
14236; AVX512-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
14237; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
14238; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14239; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14240; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14241; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm3
14242; AVX512-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
14243; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14244; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14245; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm3
14246; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
14247; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14248; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14249; AVX512-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
14250; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14251; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14252; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
14253; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14254; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm3
14255; AVX512-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
14256; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
14257; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14258; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14259; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14260; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm3
14261; AVX512-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
14262; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14263; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14264; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm3
14265; AVX512-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
14266; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14267; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14268; AVX512-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
14269; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14270; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14271; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
14272; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14273; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm3
14274; AVX512-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
14275; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
14276; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14277; AVX512-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14278; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14279; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm3
14280; AVX512-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
14281; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14282; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14283; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm3
14284; AVX512-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
14285; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14286; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14287; AVX512-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
14288; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14289; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14290; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm3
14291; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm15
14292; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
14293; AVX512-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14294; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
14295; AVX512-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
14296; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14297; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm9
14298; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm16
14299; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm0
14300; AVX512-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
14301; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14302; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm0
14303; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm18
14304; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1
14305; AVX512-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
14306; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14307; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm1
14308; AVX512-NEXT:    vmovdqa64 1536(%rdi), %zmm19
14309; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm10
14310; AVX512-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
14311; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14312; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm21
14313; AVX512-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
14314; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
14315; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
14316; AVX512-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
14317; AVX512-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14318; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm21
14319; AVX512-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
14320; AVX512-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
14321; AVX512-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14322; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm21
14323; AVX512-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
14324; AVX512-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
14325; AVX512-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14326; AVX512-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
14327; AVX512-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
14328; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
14329; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14330; AVX512-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
14331; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
14332; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
14333; AVX512-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
14334; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14335; AVX512-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
14336; AVX512-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
14337; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14338; AVX512-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
14339; AVX512-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
14340; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14341; AVX512-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
14342; AVX512-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
14343; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14344; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm2
14345; AVX512-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
14346; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14347; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm2
14348; AVX512-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
14349; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14350; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2
14351; AVX512-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
14352; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14353; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
14354; AVX512-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
14355; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14356; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
14357; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
14358; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm2
14359; AVX512-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
14360; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14361; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
14362; AVX512-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14363; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm2
14364; AVX512-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
14365; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14366; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
14367; AVX512-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
14368; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm2
14369; AVX512-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
14370; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14371; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
14372; AVX512-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
14373; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm2
14374; AVX512-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
14375; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14376; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
14377; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
14378; AVX512-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
14379; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14380; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm4
14381; AVX512-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
14382; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14383; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm4
14384; AVX512-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
14385; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14386; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm4
14387; AVX512-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
14388; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14389; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm4
14390; AVX512-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
14391; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14392; AVX512-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
14393; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14394; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm0
14395; AVX512-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
14396; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
14397; AVX512-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
14398; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
14399; AVX512-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
14400; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14401; AVX512-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
14402; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
14403; AVX512-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
14404; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14405; AVX512-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
14406; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
14407; AVX512-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
14408; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14409; AVX512-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
14410; AVX512-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
14411; AVX512-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14412; AVX512-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
14413; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14414; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm0
14415; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm17
14416; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
14417; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm22
14418; AVX512-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
14419; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
14420; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm23
14421; AVX512-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
14422; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
14423; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm24
14424; AVX512-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
14425; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
14426; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm29
14427; AVX512-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
14428; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
14429; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1
14430; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
14431; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14432; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
14433; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1
14434; AVX512-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
14435; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14436; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
14437; AVX512-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
14438; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm5
14439; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm0
14440; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
14441; AVX512-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
14442; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm14
14443; AVX512-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
14444; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm15
14445; AVX512-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
14446; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm16
14447; AVX512-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
14448; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
14449; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
14450; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14451; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
14452; AVX512-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
14453; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14454; AVX512-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
14455; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm9
14456; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm6
14457; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
14458; AVX512-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
14459; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm0
14460; AVX512-NEXT:    vmovdqa64 1344(%rdi), %zmm1
14461; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
14462; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm10
14463; AVX512-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
14464; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
14465; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm11
14466; AVX512-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
14467; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
14468; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm12
14469; AVX512-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
14470; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
14471; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm21
14472; AVX512-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
14473; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
14474; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm26
14475; AVX512-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
14476; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
14477; AVX512-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
14478; AVX512-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
14479; AVX512-NEXT:    movw $992, %ax # imm = 0x3E0
14480; AVX512-NEXT:    kmovw %eax, %k1
14481; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14482; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
14483; AVX512-NEXT:    movb $-32, %al
14484; AVX512-NEXT:    kmovw %eax, %k2
14485; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14486; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
14487; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14488; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
14489; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14490; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
14491; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
14492; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
14493; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14494; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
14495; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
14496; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14497; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
14498; AVX512-NEXT:    movw $480, %ax # imm = 0x1E0
14499; AVX512-NEXT:    kmovw %eax, %k2
14500; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14501; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
14502; AVX512-NEXT:    movw $-512, %ax # imm = 0xFE00
14503; AVX512-NEXT:    kmovw %eax, %k1
14504; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14505; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
14506; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14507; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
14508; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14509; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
14510; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14511; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
14512; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14513; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
14514; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14515; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
14516; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14517; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
14518; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14519; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
14520; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14521; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
14522; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14523; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
14524; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14525; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
14526; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14527; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
14528; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14529; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
14530; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14531; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
14532; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14533; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
14534; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14535; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
14536; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14537; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
14538; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14539; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
14540; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14541; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
14542; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14543; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
14544; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14545; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
14546; AVX512-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
14547; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14548; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
14549; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14550; AVX512-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
14551; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
14552; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
14553; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
14554; AVX512-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
14555; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
14556; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
14557; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
14558; AVX512-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
14559; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
14560; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
14561; AVX512-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
14562; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
14563; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
14564; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
14565; AVX512-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
14566; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
14567; AVX512-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
14568; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
14569; AVX512-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
14570; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
14571; AVX512-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
14572; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
14573; AVX512-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
14574; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14575; AVX512-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
14576; AVX512-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
14577; AVX512-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
14578; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14579; AVX512-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
14580; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14581; AVX512-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
14582; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14583; AVX512-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
14584; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14585; AVX512-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
14586; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14587; AVX512-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
14588; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14589; AVX512-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
14590; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14591; AVX512-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
14592; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
14593; AVX512-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
14594; AVX512-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
14595; AVX512-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
14596; AVX512-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
14597; AVX512-NEXT:    vmovdqa64 %zmm13, (%rsi)
14598; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
14599; AVX512-NEXT:    vmovdqa64 %zmm14, (%rdx)
14600; AVX512-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
14601; AVX512-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
14602; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
14603; AVX512-NEXT:    vmovdqa64 %zmm15, (%rcx)
14604; AVX512-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
14605; AVX512-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
14606; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%r8)
14607; AVX512-NEXT:    vmovdqa64 %zmm16, (%r8)
14608; AVX512-NEXT:    vmovdqa64 %zmm29, 64(%r8)
14609; AVX512-NEXT:    vmovdqa64 %zmm12, 128(%r8)
14610; AVX512-NEXT:    vmovdqa64 %zmm18, 192(%r9)
14611; AVX512-NEXT:    vmovdqa64 %zmm20, (%r9)
14612; AVX512-NEXT:    vmovdqa64 %zmm9, 64(%r9)
14613; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%r9)
14614; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14615; AVX512-NEXT:    vmovdqa64 %zmm19, 192(%rax)
14616; AVX512-NEXT:    vmovdqa64 %zmm27, (%rax)
14617; AVX512-NEXT:    vmovdqa64 %zmm26, 64(%rax)
14618; AVX512-NEXT:    vmovdqa64 %zmm25, 128(%rax)
14619; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14620; AVX512-NEXT:    vmovdqa64 %zmm6, 128(%rax)
14621; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
14622; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
14623; AVX512-NEXT:    vmovdqa64 %zmm17, 64(%rax)
14624; AVX512-NEXT:    addq $3400, %rsp # imm = 0xD48
14625; AVX512-NEXT:    vzeroupper
14626; AVX512-NEXT:    retq
14627;
14628; AVX512-FCP-LABEL: load_i32_stride7_vf64:
14629; AVX512-FCP:       # %bb.0:
14630; AVX512-FCP-NEXT:    subq $3400, %rsp # imm = 0xD48
14631; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm2
14632; AVX512-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm17
14633; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm11
14634; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm7
14635; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm5
14636; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm12
14637; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
14638; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm8
14639; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm13
14640; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm20
14641; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm4
14642; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm14
14643; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
14644; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14645; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
14646; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
14647; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
14648; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14649; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14650; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14651; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
14652; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
14653; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14654; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14655; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
14656; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
14657; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14658; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14659; AVX512-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
14660; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14661; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14662; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
14663; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14664; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
14665; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
14666; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
14667; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14668; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14669; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14670; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
14671; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
14672; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14673; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14674; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
14675; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
14676; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14677; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14678; AVX512-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
14679; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14680; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14681; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
14682; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14683; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
14684; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
14685; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
14686; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14687; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14688; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14689; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
14690; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
14691; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14692; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14693; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
14694; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
14695; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14696; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14697; AVX512-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
14698; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14699; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14700; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
14701; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14702; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
14703; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
14704; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
14705; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14706; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14707; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14708; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
14709; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
14710; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14711; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14712; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
14713; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
14714; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14715; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14716; AVX512-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
14717; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14718; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14719; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
14720; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
14721; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
14722; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
14723; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
14724; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
14725; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
14726; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14727; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
14728; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
14729; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
14730; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14731; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
14732; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
14733; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
14734; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14735; AVX512-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
14736; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
14737; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14738; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
14739; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm15
14740; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
14741; AVX512-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14742; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
14743; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
14744; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14745; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
14746; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
14747; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
14748; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
14749; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14750; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
14751; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm18
14752; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
14753; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
14754; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14755; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm1
14756; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm19
14757; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
14758; AVX512-FCP-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
14759; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14760; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
14761; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
14762; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
14763; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
14764; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
14765; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14766; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm21
14767; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
14768; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
14769; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14770; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm21
14771; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
14772; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
14773; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14774; AVX512-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
14775; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
14776; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
14777; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14778; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
14779; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
14780; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
14781; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
14782; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14783; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
14784; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
14785; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14786; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
14787; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
14788; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14789; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
14790; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
14791; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14792; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
14793; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
14794; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14795; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
14796; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
14797; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14798; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2
14799; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
14800; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14801; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
14802; AVX512-FCP-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
14803; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14804; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
14805; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
14806; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm2
14807; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
14808; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14809; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
14810; AVX512-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
14811; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
14812; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
14813; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14814; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
14815; AVX512-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
14816; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
14817; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
14818; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14819; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
14820; AVX512-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
14821; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
14822; AVX512-FCP-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
14823; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14824; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
14825; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
14826; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
14827; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14828; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm4
14829; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
14830; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14831; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
14832; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
14833; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14834; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
14835; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
14836; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14837; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
14838; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
14839; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14840; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
14841; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14842; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
14843; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
14844; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
14845; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
14846; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
14847; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
14848; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14849; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
14850; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
14851; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
14852; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14853; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
14854; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
14855; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
14856; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14857; AVX512-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
14858; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
14859; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14860; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
14861; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14862; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm0
14863; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm17
14864; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
14865; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
14866; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
14867; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
14868; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm23
14869; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
14870; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
14871; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
14872; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
14873; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
14874; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29
14875; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
14876; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
14877; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
14878; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
14879; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14880; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
14881; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
14882; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
14883; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14884; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
14885; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
14886; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
14887; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
14888; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
14889; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
14890; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
14891; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
14892; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
14893; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
14894; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
14895; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
14896; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
14897; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
14898; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14899; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
14900; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
14901; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14902; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
14903; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm9
14904; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm6
14905; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
14906; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
14907; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
14908; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm1
14909; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
14910; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
14911; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
14912; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
14913; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
14914; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
14915; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
14916; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
14917; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
14918; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
14919; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
14920; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
14921; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
14922; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm26
14923; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
14924; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
14925; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
14926; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
14927; AVX512-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
14928; AVX512-FCP-NEXT:    kmovw %eax, %k1
14929; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14930; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
14931; AVX512-FCP-NEXT:    movb $-32, %al
14932; AVX512-FCP-NEXT:    kmovw %eax, %k2
14933; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14934; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
14935; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14936; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
14937; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14938; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
14939; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
14940; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
14941; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14942; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
14943; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
14944; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14945; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
14946; AVX512-FCP-NEXT:    movw $480, %ax # imm = 0x1E0
14947; AVX512-FCP-NEXT:    kmovw %eax, %k2
14948; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14949; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
14950; AVX512-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
14951; AVX512-FCP-NEXT:    kmovw %eax, %k1
14952; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14953; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
14954; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14955; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
14956; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14957; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
14958; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14959; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
14960; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14961; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
14962; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14963; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
14964; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14965; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
14966; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14967; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
14968; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14969; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
14970; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14971; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
14972; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14973; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
14974; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14975; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
14976; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14977; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
14978; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14979; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
14980; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14981; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
14982; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14983; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
14984; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14985; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
14986; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14987; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
14988; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14989; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
14990; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14991; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
14992; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14993; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
14994; AVX512-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
14995; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14996; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
14997; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14998; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
14999; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15000; AVX512-FCP-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
15001; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15002; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
15003; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
15004; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
15005; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
15006; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
15007; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15008; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
15009; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
15010; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15011; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
15012; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15013; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
15014; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
15015; AVX512-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
15016; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
15017; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
15018; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15019; AVX512-FCP-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
15020; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15021; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
15022; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15023; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
15024; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
15025; AVX512-FCP-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
15026; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15027; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
15028; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15029; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
15030; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15031; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
15032; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15033; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
15034; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15035; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
15036; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15037; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
15038; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15039; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
15040; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15041; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
15042; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
15043; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
15044; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
15045; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, (%rsi)
15046; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
15047; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
15048; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
15049; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
15050; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
15051; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
15052; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
15053; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
15054; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 192(%r8)
15055; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
15056; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 64(%r8)
15057; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 128(%r8)
15058; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 192(%r9)
15059; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%r9)
15060; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
15061; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
15062; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15063; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 192(%rax)
15064; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, (%rax)
15065; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rax)
15066; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 128(%rax)
15067; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15068; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
15069; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
15070; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
15071; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
15072; AVX512-FCP-NEXT:    addq $3400, %rsp # imm = 0xD48
15073; AVX512-FCP-NEXT:    vzeroupper
15074; AVX512-FCP-NEXT:    retq
15075;
15076; AVX512DQ-LABEL: load_i32_stride7_vf64:
15077; AVX512DQ:       # %bb.0:
15078; AVX512DQ-NEXT:    subq $3400, %rsp # imm = 0xD48
15079; AVX512DQ-NEXT:    vmovdqa64 1728(%rdi), %zmm2
15080; AVX512DQ-NEXT:    vmovdqa64 1664(%rdi), %zmm17
15081; AVX512DQ-NEXT:    vmovdqa64 1600(%rdi), %zmm11
15082; AVX512DQ-NEXT:    vmovdqa64 1280(%rdi), %zmm7
15083; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %zmm5
15084; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm12
15085; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm6
15086; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm8
15087; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm13
15088; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm20
15089; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm4
15090; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm14
15091; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
15092; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15093; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm3
15094; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15095; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
15096; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15097; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15098; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15099; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm3
15100; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15101; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15102; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15103; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm3
15104; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
15105; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15106; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15107; AVX512DQ-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
15108; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15109; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15110; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
15111; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15112; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm3
15113; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
15114; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
15115; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15116; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15117; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15118; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm3
15119; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15120; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15121; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15122; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm3
15123; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15124; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15125; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15126; AVX512DQ-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
15127; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15128; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15129; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
15130; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15131; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm3
15132; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
15133; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
15134; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15135; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15136; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15137; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm3
15138; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15139; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15140; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15141; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm3
15142; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15143; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15144; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15145; AVX512DQ-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
15146; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15147; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15148; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
15149; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15150; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm3
15151; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
15152; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
15153; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15154; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15155; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15156; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm3
15157; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
15158; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15159; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15160; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm3
15161; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
15162; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15163; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15164; AVX512DQ-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
15165; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15166; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15167; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
15168; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15169; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm3
15170; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
15171; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
15172; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15173; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15174; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15175; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm3
15176; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
15177; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15178; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15179; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm3
15180; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
15181; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15182; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15183; AVX512DQ-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
15184; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15185; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15186; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm3
15187; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm15
15188; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
15189; AVX512DQ-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15190; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
15191; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
15192; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15193; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm9
15194; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm16
15195; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm0
15196; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
15197; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15198; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm0
15199; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm18
15200; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1
15201; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
15202; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15203; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm1
15204; AVX512DQ-NEXT:    vmovdqa64 1536(%rdi), %zmm19
15205; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm10
15206; AVX512DQ-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
15207; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15208; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm21
15209; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
15210; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
15211; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
15212; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
15213; AVX512DQ-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15214; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm21
15215; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
15216; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
15217; AVX512DQ-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15218; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm21
15219; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
15220; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
15221; AVX512DQ-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15222; AVX512DQ-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
15223; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
15224; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
15225; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15226; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
15227; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
15228; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
15229; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
15230; AVX512DQ-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15231; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
15232; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
15233; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15234; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
15235; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
15236; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15237; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
15238; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
15239; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15240; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm2
15241; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
15242; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15243; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm2
15244; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
15245; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15246; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm2
15247; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
15248; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15249; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
15250; AVX512DQ-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
15251; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15252; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
15253; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
15254; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm2
15255; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
15256; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15257; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
15258; AVX512DQ-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15259; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm2
15260; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
15261; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15262; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
15263; AVX512DQ-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
15264; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm2
15265; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
15266; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15267; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
15268; AVX512DQ-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
15269; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm2
15270; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
15271; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15272; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
15273; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
15274; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
15275; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15276; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm4
15277; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
15278; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15279; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm4
15280; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
15281; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15282; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm4
15283; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
15284; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15285; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm4
15286; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
15287; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15288; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
15289; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15290; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm0
15291; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
15292; AVX512DQ-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
15293; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
15294; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
15295; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
15296; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15297; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
15298; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
15299; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
15300; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15301; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
15302; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
15303; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
15304; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15305; AVX512DQ-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
15306; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
15307; AVX512DQ-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15308; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
15309; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15310; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm0
15311; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm17
15312; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
15313; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm22
15314; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
15315; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
15316; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm23
15317; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
15318; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
15319; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm24
15320; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
15321; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
15322; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm29
15323; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
15324; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
15325; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1
15326; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
15327; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15328; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
15329; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1
15330; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
15331; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15332; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
15333; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
15334; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm5
15335; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm0
15336; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
15337; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
15338; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm14
15339; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
15340; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm15
15341; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
15342; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm16
15343; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
15344; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
15345; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
15346; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15347; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
15348; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
15349; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15350; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
15351; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm9
15352; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm6
15353; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm8
15354; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
15355; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm0
15356; AVX512DQ-NEXT:    vmovdqa64 1344(%rdi), %zmm1
15357; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
15358; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm10
15359; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
15360; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
15361; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm11
15362; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
15363; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
15364; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm12
15365; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
15366; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
15367; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm21
15368; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
15369; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
15370; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm26
15371; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
15372; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
15373; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
15374; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
15375; AVX512DQ-NEXT:    movw $992, %ax # imm = 0x3E0
15376; AVX512DQ-NEXT:    kmovw %eax, %k1
15377; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15378; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
15379; AVX512DQ-NEXT:    movb $-32, %al
15380; AVX512DQ-NEXT:    kmovw %eax, %k2
15381; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15382; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
15383; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15384; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
15385; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15386; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
15387; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
15388; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
15389; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15390; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
15391; AVX512DQ-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
15392; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15393; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
15394; AVX512DQ-NEXT:    movw $480, %ax # imm = 0x1E0
15395; AVX512DQ-NEXT:    kmovw %eax, %k2
15396; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15397; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
15398; AVX512DQ-NEXT:    movw $-512, %ax # imm = 0xFE00
15399; AVX512DQ-NEXT:    kmovw %eax, %k1
15400; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15401; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
15402; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15403; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
15404; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15405; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
15406; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15407; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
15408; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15409; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
15410; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15411; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
15412; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15413; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
15414; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15415; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
15416; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15417; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
15418; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15419; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
15420; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15421; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
15422; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15423; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
15424; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15425; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
15426; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15427; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
15428; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15429; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
15430; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15431; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
15432; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15433; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
15434; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15435; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
15436; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15437; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
15438; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15439; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
15440; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15441; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
15442; AVX512DQ-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
15443; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15444; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
15445; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15446; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
15447; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15448; AVX512DQ-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
15449; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15450; AVX512DQ-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
15451; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
15452; AVX512DQ-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
15453; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
15454; AVX512DQ-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
15455; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15456; AVX512DQ-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
15457; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
15458; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15459; AVX512DQ-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
15460; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15461; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
15462; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
15463; AVX512DQ-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
15464; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
15465; AVX512DQ-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
15466; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15467; AVX512DQ-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
15468; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15469; AVX512DQ-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
15470; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15471; AVX512DQ-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
15472; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
15473; AVX512DQ-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
15474; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15475; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
15476; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15477; AVX512DQ-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
15478; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15479; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
15480; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15481; AVX512DQ-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
15482; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15483; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
15484; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15485; AVX512DQ-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
15486; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15487; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
15488; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15489; AVX512DQ-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
15490; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
15491; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
15492; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
15493; AVX512DQ-NEXT:    vmovdqa64 %zmm13, (%rsi)
15494; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
15495; AVX512DQ-NEXT:    vmovdqa64 %zmm14, (%rdx)
15496; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
15497; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
15498; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
15499; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rcx)
15500; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
15501; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
15502; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%r8)
15503; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%r8)
15504; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 64(%r8)
15505; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 128(%r8)
15506; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 192(%r9)
15507; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%r9)
15508; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 64(%r9)
15509; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%r9)
15510; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15511; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 192(%rax)
15512; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%rax)
15513; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 64(%rax)
15514; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 128(%rax)
15515; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15516; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rax)
15517; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
15518; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
15519; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 64(%rax)
15520; AVX512DQ-NEXT:    addq $3400, %rsp # imm = 0xD48
15521; AVX512DQ-NEXT:    vzeroupper
15522; AVX512DQ-NEXT:    retq
15523;
15524; AVX512DQ-FCP-LABEL: load_i32_stride7_vf64:
15525; AVX512DQ-FCP:       # %bb.0:
15526; AVX512DQ-FCP-NEXT:    subq $3400, %rsp # imm = 0xD48
15527; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm2
15528; AVX512DQ-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm17
15529; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm11
15530; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm7
15531; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm5
15532; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm12
15533; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
15534; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm8
15535; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm13
15536; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm20
15537; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm4
15538; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm14
15539; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
15540; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15541; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
15542; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15543; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
15544; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15545; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15546; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15547; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
15548; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15549; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15550; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15551; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
15552; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
15553; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15554; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15555; AVX512DQ-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
15556; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15557; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15558; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
15559; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15560; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
15561; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
15562; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
15563; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15564; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15565; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15566; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
15567; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15568; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15569; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15570; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
15571; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15572; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15573; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15574; AVX512DQ-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
15575; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15576; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15577; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
15578; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15579; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
15580; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
15581; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
15582; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15583; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15584; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15585; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
15586; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15587; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15588; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15589; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
15590; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15591; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15592; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15593; AVX512DQ-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
15594; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15595; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15596; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
15597; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15598; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
15599; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
15600; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
15601; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15602; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15603; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15604; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
15605; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
15606; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15607; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15608; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
15609; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
15610; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15611; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15612; AVX512DQ-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
15613; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15614; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15615; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
15616; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
15617; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
15618; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
15619; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
15620; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15621; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
15622; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15623; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
15624; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
15625; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15626; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15627; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
15628; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
15629; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15630; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15631; AVX512DQ-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
15632; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
15633; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15634; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
15635; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm15
15636; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
15637; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15638; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
15639; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
15640; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15641; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
15642; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
15643; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
15644; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
15645; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15646; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
15647; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm18
15648; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
15649; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
15650; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15651; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm1
15652; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm19
15653; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
15654; AVX512DQ-FCP-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
15655; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15656; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
15657; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
15658; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
15659; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
15660; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
15661; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15662; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm21
15663; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
15664; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
15665; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15666; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm21
15667; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
15668; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
15669; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15670; AVX512DQ-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
15671; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
15672; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
15673; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15674; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
15675; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
15676; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
15677; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
15678; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15679; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
15680; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
15681; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15682; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
15683; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
15684; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15685; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
15686; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
15687; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15688; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
15689; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
15690; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15691; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
15692; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
15693; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15694; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2
15695; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
15696; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15697; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
15698; AVX512DQ-FCP-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
15699; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15700; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
15701; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
15702; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm2
15703; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
15704; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15705; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
15706; AVX512DQ-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15707; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
15708; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
15709; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15710; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
15711; AVX512DQ-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
15712; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
15713; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
15714; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15715; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
15716; AVX512DQ-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
15717; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
15718; AVX512DQ-FCP-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
15719; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15720; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
15721; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
15722; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
15723; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15724; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm4
15725; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
15726; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15727; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
15728; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
15729; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15730; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
15731; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
15732; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15733; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
15734; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
15735; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15736; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
15737; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15738; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
15739; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
15740; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
15741; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
15742; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
15743; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
15744; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15745; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
15746; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
15747; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
15748; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15749; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
15750; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
15751; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
15752; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15753; AVX512DQ-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
15754; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
15755; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15756; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
15757; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15758; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm0
15759; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm17
15760; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
15761; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
15762; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
15763; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
15764; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm23
15765; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
15766; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
15767; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
15768; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
15769; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
15770; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29
15771; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
15772; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
15773; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
15774; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
15775; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15776; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
15777; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
15778; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
15779; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15780; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
15781; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
15782; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
15783; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
15784; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
15785; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
15786; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
15787; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
15788; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
15789; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
15790; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
15791; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
15792; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
15793; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
15794; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15795; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
15796; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
15797; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15798; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
15799; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm9
15800; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm6
15801; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
15802; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
15803; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
15804; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm1
15805; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
15806; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
15807; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
15808; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
15809; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
15810; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
15811; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
15812; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
15813; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
15814; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
15815; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
15816; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
15817; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
15818; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm26
15819; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
15820; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
15821; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
15822; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
15823; AVX512DQ-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
15824; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
15825; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15826; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
15827; AVX512DQ-FCP-NEXT:    movb $-32, %al
15828; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
15829; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15830; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
15831; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15832; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
15833; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15834; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
15835; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
15836; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
15837; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15838; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
15839; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
15840; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15841; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
15842; AVX512DQ-FCP-NEXT:    movw $480, %ax # imm = 0x1E0
15843; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
15844; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15845; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
15846; AVX512DQ-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
15847; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
15848; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15849; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
15850; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15851; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
15852; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15853; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
15854; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15855; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
15856; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15857; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
15858; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15859; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
15860; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15861; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
15862; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15863; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
15864; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15865; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
15866; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15867; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
15868; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15869; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
15870; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15871; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
15872; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15873; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
15874; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15875; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
15876; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15877; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
15878; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15879; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
15880; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15881; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
15882; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15883; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
15884; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15885; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
15886; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15887; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
15888; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15889; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
15890; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
15891; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15892; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
15893; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15894; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
15895; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15896; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
15897; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15898; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
15899; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
15900; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
15901; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
15902; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
15903; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15904; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
15905; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
15906; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15907; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
15908; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15909; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
15910; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
15911; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
15912; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
15913; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
15914; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15915; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
15916; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15917; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
15918; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15919; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
15920; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
15921; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
15922; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15923; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
15924; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15925; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
15926; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15927; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
15928; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15929; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
15930; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15931; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
15932; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15933; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
15934; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15935; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
15936; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15937; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
15938; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
15939; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
15940; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
15941; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, (%rsi)
15942; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
15943; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
15944; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
15945; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
15946; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
15947; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
15948; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
15949; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
15950; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 192(%r8)
15951; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
15952; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 64(%r8)
15953; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 128(%r8)
15954; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 192(%r9)
15955; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%r9)
15956; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
15957; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
15958; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15959; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 192(%rax)
15960; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, (%rax)
15961; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rax)
15962; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 128(%rax)
15963; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
15964; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
15965; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
15966; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
15967; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
15968; AVX512DQ-FCP-NEXT:    addq $3400, %rsp # imm = 0xD48
15969; AVX512DQ-FCP-NEXT:    vzeroupper
15970; AVX512DQ-FCP-NEXT:    retq
15971;
15972; AVX512BW-LABEL: load_i32_stride7_vf64:
15973; AVX512BW:       # %bb.0:
15974; AVX512BW-NEXT:    subq $3400, %rsp # imm = 0xD48
15975; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm2
15976; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm17
15977; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm11
15978; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm7
15979; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm5
15980; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm12
15981; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
15982; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm8
15983; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm13
15984; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm20
15985; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm4
15986; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm14
15987; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
15988; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15989; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
15990; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
15991; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
15992; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
15993; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
15994; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15995; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
15996; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
15997; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
15998; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15999; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
16000; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16001; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16002; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16003; AVX512BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16004; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16005; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16006; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
16007; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16008; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
16009; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16010; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
16011; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16012; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16013; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16014; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
16015; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16016; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16017; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16018; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
16019; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16020; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16021; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16022; AVX512BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16023; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16024; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16025; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
16026; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16027; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
16028; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16029; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
16030; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16031; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16032; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16033; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
16034; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16035; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16036; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16037; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
16038; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16039; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16040; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16041; AVX512BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16042; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16043; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16044; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
16045; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16046; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3
16047; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
16048; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
16049; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16050; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16051; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16052; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
16053; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
16054; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16055; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16056; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
16057; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
16058; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16059; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16060; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
16061; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16062; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16063; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
16064; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16065; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3
16066; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
16067; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
16068; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16069; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16070; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16071; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
16072; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
16073; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16074; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16075; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
16076; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
16077; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16078; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16079; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
16080; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16081; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16082; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
16083; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm15
16084; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
16085; AVX512BW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16086; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
16087; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
16088; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16089; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
16090; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
16091; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
16092; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
16093; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16094; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
16095; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm18
16096; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
16097; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
16098; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16099; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
16100; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm19
16101; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
16102; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
16103; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16104; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm21
16105; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
16106; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
16107; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
16108; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
16109; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16110; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm21
16111; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
16112; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
16113; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16114; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm21
16115; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
16116; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
16117; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16118; AVX512BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
16119; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
16120; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
16121; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16122; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
16123; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
16124; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
16125; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
16126; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16127; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
16128; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
16129; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16130; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
16131; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
16132; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16133; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
16134; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
16135; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16136; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
16137; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
16138; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16139; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
16140; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
16141; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16142; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
16143; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
16144; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16145; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
16146; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
16147; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16148; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
16149; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
16150; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
16151; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
16152; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16153; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
16154; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16155; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
16156; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
16157; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16158; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
16159; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
16160; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
16161; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
16162; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16163; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
16164; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
16165; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
16166; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
16167; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16168; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
16169; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
16170; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
16171; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16172; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm4
16173; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
16174; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16175; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
16176; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
16177; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16178; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
16179; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
16180; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16181; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
16182; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
16183; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16184; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
16185; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16186; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
16187; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
16188; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
16189; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
16190; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
16191; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
16192; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16193; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
16194; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
16195; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
16196; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16197; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
16198; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
16199; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
16200; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16201; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
16202; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
16203; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16204; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
16205; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16206; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm0
16207; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm17
16208; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
16209; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm22
16210; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
16211; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
16212; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm23
16213; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
16214; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
16215; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24
16216; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
16217; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
16218; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm29
16219; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
16220; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
16221; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
16222; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
16223; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16224; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
16225; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
16226; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
16227; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16228; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
16229; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
16230; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
16231; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
16232; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
16233; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
16234; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
16235; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
16236; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15
16237; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
16238; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16
16239; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
16240; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
16241; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
16242; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16243; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
16244; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
16245; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16246; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
16247; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm9
16248; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm6
16249; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
16250; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
16251; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
16252; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm1
16253; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
16254; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
16255; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
16256; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
16257; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm11
16258; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
16259; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
16260; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm12
16261; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
16262; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
16263; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm21
16264; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
16265; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
16266; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm26
16267; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
16268; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
16269; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
16270; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
16271; AVX512BW-NEXT:    movw $992, %ax # imm = 0x3E0
16272; AVX512BW-NEXT:    kmovd %eax, %k1
16273; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16274; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
16275; AVX512BW-NEXT:    movb $-32, %al
16276; AVX512BW-NEXT:    kmovd %eax, %k2
16277; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16278; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
16279; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16280; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
16281; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16282; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
16283; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
16284; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
16285; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16286; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
16287; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
16288; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16289; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
16290; AVX512BW-NEXT:    movw $480, %ax # imm = 0x1E0
16291; AVX512BW-NEXT:    kmovd %eax, %k2
16292; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16293; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
16294; AVX512BW-NEXT:    movw $-512, %ax # imm = 0xFE00
16295; AVX512BW-NEXT:    kmovd %eax, %k1
16296; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16297; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
16298; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16299; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
16300; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16301; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
16302; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16303; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
16304; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16305; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
16306; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16307; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
16308; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16309; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
16310; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16311; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
16312; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16313; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
16314; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16315; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
16316; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16317; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
16318; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16319; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
16320; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16321; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
16322; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16323; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
16324; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16325; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
16326; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16327; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
16328; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16329; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
16330; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16331; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
16332; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16333; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
16334; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16335; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
16336; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16337; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
16338; AVX512BW-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
16339; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16340; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
16341; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16342; AVX512BW-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
16343; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16344; AVX512BW-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
16345; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16346; AVX512BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
16347; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
16348; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
16349; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
16350; AVX512BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
16351; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
16352; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
16353; AVX512BW-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
16354; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
16355; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
16356; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
16357; AVX512BW-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
16358; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
16359; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
16360; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
16361; AVX512BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
16362; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
16363; AVX512BW-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
16364; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
16365; AVX512BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
16366; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16367; AVX512BW-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
16368; AVX512BW-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
16369; AVX512BW-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
16370; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16371; AVX512BW-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
16372; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16373; AVX512BW-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
16374; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16375; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
16376; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16377; AVX512BW-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
16378; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16379; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
16380; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16381; AVX512BW-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
16382; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16383; AVX512BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
16384; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16385; AVX512BW-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
16386; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
16387; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
16388; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
16389; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%rsi)
16390; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
16391; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
16392; AVX512BW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
16393; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
16394; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
16395; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
16396; AVX512BW-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
16397; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
16398; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%r8)
16399; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%r8)
16400; AVX512BW-NEXT:    vmovdqa64 %zmm29, 64(%r8)
16401; AVX512BW-NEXT:    vmovdqa64 %zmm12, 128(%r8)
16402; AVX512BW-NEXT:    vmovdqa64 %zmm18, 192(%r9)
16403; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%r9)
16404; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
16405; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%r9)
16406; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16407; AVX512BW-NEXT:    vmovdqa64 %zmm19, 192(%rax)
16408; AVX512BW-NEXT:    vmovdqa64 %zmm27, (%rax)
16409; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%rax)
16410; AVX512BW-NEXT:    vmovdqa64 %zmm25, 128(%rax)
16411; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16412; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
16413; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
16414; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
16415; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
16416; AVX512BW-NEXT:    addq $3400, %rsp # imm = 0xD48
16417; AVX512BW-NEXT:    vzeroupper
16418; AVX512BW-NEXT:    retq
16419;
16420; AVX512BW-FCP-LABEL: load_i32_stride7_vf64:
16421; AVX512BW-FCP:       # %bb.0:
16422; AVX512BW-FCP-NEXT:    subq $3400, %rsp # imm = 0xD48
16423; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm2
16424; AVX512BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm17
16425; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm11
16426; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm7
16427; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm5
16428; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm12
16429; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
16430; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm8
16431; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm13
16432; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm20
16433; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm4
16434; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm14
16435; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
16436; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16437; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
16438; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16439; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
16440; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16441; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16442; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16443; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
16444; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16445; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16446; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16447; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
16448; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16449; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16450; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16451; AVX512BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16452; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16453; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16454; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
16455; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16456; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
16457; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16458; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
16459; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16460; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16461; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16462; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
16463; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16464; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16465; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16466; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
16467; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16468; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16469; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16470; AVX512BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16471; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16472; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16473; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
16474; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16475; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
16476; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16477; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
16478; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16479; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16480; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16481; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
16482; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16483; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16484; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16485; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
16486; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16487; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16488; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16489; AVX512BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16490; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16491; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16492; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
16493; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16494; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
16495; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
16496; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
16497; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16498; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16499; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16500; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
16501; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
16502; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16503; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16504; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
16505; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
16506; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16507; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16508; AVX512BW-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
16509; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16510; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16511; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
16512; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16513; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
16514; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
16515; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
16516; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16517; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16518; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16519; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
16520; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
16521; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16522; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16523; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
16524; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
16525; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16526; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16527; AVX512BW-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
16528; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16529; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16530; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
16531; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm15
16532; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
16533; AVX512BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16534; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
16535; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
16536; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16537; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
16538; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
16539; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
16540; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
16541; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16542; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
16543; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm18
16544; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
16545; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
16546; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16547; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm1
16548; AVX512BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm19
16549; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
16550; AVX512BW-FCP-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
16551; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16552; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
16553; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
16554; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
16555; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
16556; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
16557; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16558; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm21
16559; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
16560; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
16561; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16562; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm21
16563; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
16564; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
16565; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16566; AVX512BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
16567; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
16568; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
16569; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16570; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
16571; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
16572; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
16573; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
16574; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16575; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
16576; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
16577; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16578; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
16579; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
16580; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16581; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
16582; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
16583; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16584; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
16585; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
16586; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16587; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
16588; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
16589; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16590; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2
16591; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
16592; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16593; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
16594; AVX512BW-FCP-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
16595; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16596; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
16597; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
16598; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm2
16599; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
16600; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16601; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
16602; AVX512BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16603; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
16604; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
16605; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16606; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
16607; AVX512BW-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
16608; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
16609; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
16610; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16611; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
16612; AVX512BW-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
16613; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
16614; AVX512BW-FCP-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
16615; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16616; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
16617; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
16618; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
16619; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16620; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm4
16621; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
16622; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16623; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
16624; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
16625; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16626; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
16627; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
16628; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16629; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
16630; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
16631; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16632; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
16633; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16634; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
16635; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
16636; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
16637; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
16638; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
16639; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
16640; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16641; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
16642; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
16643; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
16644; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16645; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
16646; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
16647; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
16648; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16649; AVX512BW-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
16650; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
16651; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16652; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
16653; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16654; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm0
16655; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm17
16656; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
16657; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
16658; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
16659; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
16660; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm23
16661; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
16662; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
16663; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
16664; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
16665; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
16666; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29
16667; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
16668; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
16669; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
16670; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
16671; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16672; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
16673; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
16674; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
16675; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16676; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
16677; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
16678; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
16679; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
16680; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
16681; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
16682; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
16683; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
16684; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
16685; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
16686; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
16687; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
16688; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
16689; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
16690; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16691; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
16692; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
16693; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16694; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
16695; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm9
16696; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm6
16697; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
16698; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
16699; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
16700; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm1
16701; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
16702; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
16703; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
16704; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
16705; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
16706; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
16707; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
16708; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
16709; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
16710; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
16711; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
16712; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
16713; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
16714; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm26
16715; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
16716; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
16717; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
16718; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
16719; AVX512BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
16720; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
16721; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16722; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
16723; AVX512BW-FCP-NEXT:    movb $-32, %al
16724; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
16725; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16726; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
16727; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16728; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
16729; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16730; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
16731; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
16732; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
16733; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16734; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
16735; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
16736; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16737; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
16738; AVX512BW-FCP-NEXT:    movw $480, %ax # imm = 0x1E0
16739; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
16740; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16741; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
16742; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
16743; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
16744; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16745; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
16746; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16747; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
16748; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16749; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
16750; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16751; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
16752; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16753; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
16754; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16755; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
16756; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16757; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
16758; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16759; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
16760; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16761; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
16762; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16763; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
16764; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16765; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
16766; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16767; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
16768; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16769; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
16770; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16771; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
16772; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16773; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
16774; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16775; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
16776; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16777; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
16778; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16779; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
16780; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16781; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
16782; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16783; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
16784; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16785; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
16786; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
16787; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16788; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
16789; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16790; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
16791; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16792; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
16793; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16794; AVX512BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
16795; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
16796; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
16797; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
16798; AVX512BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
16799; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
16800; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
16801; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
16802; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
16803; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
16804; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
16805; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
16806; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
16807; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
16808; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
16809; AVX512BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
16810; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
16811; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
16812; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
16813; AVX512BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
16814; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16815; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
16816; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
16817; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
16818; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16819; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
16820; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16821; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
16822; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16823; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
16824; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16825; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
16826; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16827; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
16828; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16829; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
16830; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16831; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
16832; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
16833; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
16834; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
16835; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
16836; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
16837; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%rsi)
16838; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
16839; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
16840; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
16841; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
16842; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
16843; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
16844; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
16845; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
16846; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 192(%r8)
16847; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
16848; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 64(%r8)
16849; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 128(%r8)
16850; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 192(%r9)
16851; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, (%r9)
16852; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
16853; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
16854; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16855; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 192(%rax)
16856; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, (%rax)
16857; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rax)
16858; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 128(%rax)
16859; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
16860; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
16861; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
16862; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
16863; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
16864; AVX512BW-FCP-NEXT:    addq $3400, %rsp # imm = 0xD48
16865; AVX512BW-FCP-NEXT:    vzeroupper
16866; AVX512BW-FCP-NEXT:    retq
16867;
16868; AVX512DQ-BW-LABEL: load_i32_stride7_vf64:
16869; AVX512DQ-BW:       # %bb.0:
16870; AVX512DQ-BW-NEXT:    subq $3400, %rsp # imm = 0xD48
16871; AVX512DQ-BW-NEXT:    vmovdqa64 1728(%rdi), %zmm2
16872; AVX512DQ-BW-NEXT:    vmovdqa64 1664(%rdi), %zmm17
16873; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm11
16874; AVX512DQ-BW-NEXT:    vmovdqa64 1280(%rdi), %zmm7
16875; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm5
16876; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm12
16877; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
16878; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm8
16879; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm13
16880; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm20
16881; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm4
16882; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm14
16883; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
16884; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16885; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm3
16886; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16887; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
16888; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16889; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16890; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16891; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm3
16892; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16893; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16894; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16895; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm3
16896; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16897; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16898; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16899; AVX512DQ-BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16900; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16901; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16902; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
16903; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16904; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm3
16905; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16906; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
16907; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16908; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16909; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16910; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm3
16911; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16912; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16913; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16914; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm3
16915; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16916; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16917; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16918; AVX512DQ-BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16919; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16920; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16921; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
16922; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16923; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm3
16924; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
16925; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
16926; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16927; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16928; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16929; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm3
16930; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
16931; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16932; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16933; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm3
16934; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
16935; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16936; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16937; AVX512DQ-BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
16938; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16939; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16940; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
16941; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16942; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm3
16943; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
16944; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
16945; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16946; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16947; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16948; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm3
16949; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
16950; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16951; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16952; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm3
16953; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
16954; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16955; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16956; AVX512DQ-BW-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
16957; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16958; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16959; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
16960; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
16961; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm3
16962; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
16963; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
16964; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
16965; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
16966; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16967; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm3
16968; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
16969; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
16970; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16971; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm3
16972; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
16973; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
16974; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16975; AVX512DQ-BW-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
16976; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
16977; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16978; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
16979; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm15
16980; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
16981; AVX512DQ-BW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16982; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
16983; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
16984; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16985; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
16986; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
16987; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm0
16988; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
16989; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16990; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
16991; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm18
16992; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1
16993; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
16994; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16995; AVX512DQ-BW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
16996; AVX512DQ-BW-NEXT:    vmovdqa64 1536(%rdi), %zmm19
16997; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm10
16998; AVX512DQ-BW-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
16999; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17000; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm21
17001; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
17002; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
17003; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
17004; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
17005; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17006; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm21
17007; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
17008; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
17009; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17010; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm21
17011; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
17012; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
17013; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17014; AVX512DQ-BW-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
17015; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
17016; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
17017; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17018; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
17019; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
17020; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
17021; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
17022; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17023; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
17024; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
17025; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17026; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
17027; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
17028; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17029; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
17030; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
17031; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17032; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm2
17033; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
17034; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17035; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm2
17036; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
17037; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17038; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm2
17039; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
17040; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17041; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
17042; AVX512DQ-BW-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
17043; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17044; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
17045; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
17046; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm2
17047; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
17048; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17049; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
17050; AVX512DQ-BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17051; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm2
17052; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
17053; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17054; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
17055; AVX512DQ-BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
17056; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm2
17057; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
17058; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17059; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
17060; AVX512DQ-BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
17061; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm2
17062; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
17063; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17064; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
17065; AVX512DQ-BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
17066; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
17067; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17068; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm4
17069; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
17070; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17071; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm4
17072; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
17073; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17074; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm4
17075; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
17076; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17077; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm4
17078; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
17079; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17080; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
17081; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17082; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm0
17083; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
17084; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
17085; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
17086; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
17087; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
17088; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17089; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
17090; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
17091; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
17092; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17093; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
17094; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
17095; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
17096; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17097; AVX512DQ-BW-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
17098; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
17099; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17100; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
17101; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17102; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm0
17103; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm17
17104; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
17105; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm22
17106; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
17107; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
17108; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm23
17109; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
17110; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
17111; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm24
17112; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
17113; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
17114; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm29
17115; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
17116; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
17117; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm1
17118; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
17119; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17120; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
17121; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm1
17122; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
17123; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17124; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
17125; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
17126; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm5
17127; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
17128; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
17129; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
17130; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm14
17131; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
17132; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm15
17133; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
17134; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm16
17135; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
17136; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
17137; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
17138; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17139; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
17140; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
17141; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17142; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
17143; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm9
17144; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm6
17145; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm8
17146; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
17147; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
17148; AVX512DQ-BW-NEXT:    vmovdqa64 1344(%rdi), %zmm1
17149; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
17150; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm10
17151; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
17152; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
17153; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm11
17154; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
17155; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
17156; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm12
17157; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
17158; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
17159; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm21
17160; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
17161; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
17162; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm26
17163; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
17164; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
17165; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
17166; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
17167; AVX512DQ-BW-NEXT:    movw $992, %ax # imm = 0x3E0
17168; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
17169; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17170; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
17171; AVX512DQ-BW-NEXT:    movb $-32, %al
17172; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
17173; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17174; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
17175; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17176; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
17177; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17178; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
17179; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
17180; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
17181; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17182; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
17183; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
17184; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17185; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
17186; AVX512DQ-BW-NEXT:    movw $480, %ax # imm = 0x1E0
17187; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
17188; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17189; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
17190; AVX512DQ-BW-NEXT:    movw $-512, %ax # imm = 0xFE00
17191; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
17192; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17193; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
17194; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17195; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
17196; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17197; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
17198; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17199; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
17200; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17201; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
17202; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17203; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
17204; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17205; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
17206; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17207; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
17208; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17209; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
17210; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17211; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
17212; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17213; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
17214; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17215; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
17216; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17217; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
17218; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17219; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
17220; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17221; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
17222; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17223; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
17224; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17225; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
17226; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17227; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
17228; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17229; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
17230; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17231; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
17232; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17233; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
17234; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
17235; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17236; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
17237; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17238; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
17239; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17240; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
17241; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17242; AVX512DQ-BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
17243; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
17244; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
17245; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
17246; AVX512DQ-BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
17247; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
17248; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
17249; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
17250; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
17251; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
17252; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
17253; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
17254; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
17255; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
17256; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
17257; AVX512DQ-BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
17258; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
17259; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
17260; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
17261; AVX512DQ-BW-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
17262; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17263; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
17264; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
17265; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
17266; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17267; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
17268; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17269; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
17270; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17271; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
17272; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17273; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
17274; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17275; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
17276; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17277; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
17278; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17279; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
17280; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
17281; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
17282; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
17283; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
17284; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
17285; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, (%rsi)
17286; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
17287; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
17288; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
17289; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
17290; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
17291; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
17292; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
17293; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
17294; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 192(%r8)
17295; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%r8)
17296; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, 64(%r8)
17297; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, 128(%r8)
17298; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 192(%r9)
17299; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, (%r9)
17300; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
17301; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 128(%r9)
17302; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
17303; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 192(%rax)
17304; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, (%rax)
17305; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, 64(%rax)
17306; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, 128(%rax)
17307; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
17308; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
17309; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
17310; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, (%rax)
17311; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
17312; AVX512DQ-BW-NEXT:    addq $3400, %rsp # imm = 0xD48
17313; AVX512DQ-BW-NEXT:    vzeroupper
17314; AVX512DQ-BW-NEXT:    retq
17315;
17316; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf64:
17317; AVX512DQ-BW-FCP:       # %bb.0:
17318; AVX512DQ-BW-FCP-NEXT:    subq $3400, %rsp # imm = 0xD48
17319; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm2
17320; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm17
17321; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm11
17322; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm7
17323; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm5
17324; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm12
17325; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
17326; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm8
17327; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm13
17328; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm20
17329; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm4
17330; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm14
17331; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
17332; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17333; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
17334; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
17335; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
17336; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
17337; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
17338; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17339; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
17340; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
17341; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
17342; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17343; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
17344; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
17345; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
17346; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17347; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
17348; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
17349; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17350; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
17351; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
17352; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
17353; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
17354; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
17355; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
17356; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
17357; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17358; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
17359; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
17360; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
17361; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17362; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
17363; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
17364; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
17365; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17366; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
17367; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
17368; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17369; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
17370; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
17371; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
17372; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm1, %zmm3
17373; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
17374; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
17375; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
17376; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17377; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
17378; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm1, %zmm3
17379; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
17380; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17381; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
17382; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm3
17383; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
17384; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17385; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm1
17386; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
17387; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17388; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
17389; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
17390; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
17391; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
17392; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
17393; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
17394; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
17395; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17396; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
17397; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
17398; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
17399; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17400; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
17401; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
17402; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
17403; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17404; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
17405; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
17406; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17407; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
17408; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
17409; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
17410; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm1, %zmm3
17411; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
17412; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
17413; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm0, %zmm3
17414; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17415; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
17416; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm1, %zmm3
17417; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm3
17418; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17419; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
17420; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm1, %zmm3
17421; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm3
17422; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17423; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm11, %zmm17, %zmm1
17424; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
17425; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17426; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm3
17427; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm15
17428; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
17429; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17430; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
17431; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm30, %zmm0
17432; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17433; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm9
17434; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
17435; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
17436; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm30, %zmm0
17437; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17438; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
17439; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm18
17440; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
17441; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm30, %zmm1
17442; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17443; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm1
17444; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm19
17445; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
17446; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm19, %zmm30, %zmm10
17447; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17448; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm21
17449; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm30, %zmm21
17450; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
17451; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
17452; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm10, %zmm21
17453; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17454; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm21
17455; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm21
17456; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm10, %zmm21
17457; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17458; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm21
17459; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm30, %zmm21
17460; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm10, %zmm21
17461; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17462; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm17, %zmm11, %zmm30
17463; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm10, %zmm30
17464; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
17465; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17466; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm10, %zmm13
17467; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
17468; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
17469; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm8, %zmm13
17470; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17471; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm14
17472; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm8, %zmm14
17473; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17474; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm10, %zmm11
17475; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm11
17476; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17477; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
17478; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm8, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm10, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm10, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm10, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm19, %zmm10, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
; AVX512DQ-BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm27, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
; AVX512DQ-BW-FCP-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm28, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
; AVX512DQ-BW-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm16, %zmm31, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm27, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm28, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm31, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm19, %zmm25
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm27, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm28, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm31, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm19, %zmm1, %zmm31
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm2, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm2, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm4, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm7, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm3, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm4, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm7, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm9
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm3, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm4, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm7, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm20, %zmm6
; AVX512DQ-BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT:    movb $-32, %al
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT:    movw $480, %ax # imm = 0x1E0
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm24 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm21, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm18, %zmm28, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm26, %zmm25, %zmm25
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm25 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm27, %zmm26 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm27 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm31, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm30, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm28, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm28, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm28, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm28, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 192(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 64(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 128(%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 192(%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 192(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 128(%rax)
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    addq $3400, %rsp # imm = 0xD48
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <448 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
  %strided.vec1 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
  %strided.vec2 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
  %strided.vec3 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
  %strided.vec4 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
  %strided.vec5 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
  %strided.vec6 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <64 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}
