; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.

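; As an illustrative sketch (not part of the checked output), each test below
; corresponds to de-interleaving a stride-3 scalar loop such as:
;
;   for (int i = 0; i != n; ++i) {
;     out0[i] = in[3*i + 0];
;     out1[i] = in[3*i + 1];
;     out2[i] = in[3*i + 2];
;   }
;
; The vectorizer turns this into one wide load plus three shufflevectors whose
; masks select every third element (0,3,6,... / 1,4,7,... / 2,5,8,...). The
; names in, out0..out2, and n above are hypothetical and used for illustration
; only.
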
define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movq %xmm2, (%rsi)
; SSE-NEXT:    movq %xmm3, (%rdx)
; SSE-NEXT:    movq %xmm0, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride3_vf2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    vmovlps %xmm2, (%rsi)
; AVX-NEXT:    vmovlps %xmm3, (%rdx)
; AVX-NEXT:    vmovlps %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride3_vf2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-NEXT:    vmovlps %xmm2, (%rsi)
; AVX2-NEXT:    vmovlps %xmm0, (%rdx)
; AVX2-NEXT:    vmovlps %xmm1, (%rcx)
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf2:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FP-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FP-NEXT:    vmovlps %xmm2, (%rsi)
; AVX2-FP-NEXT:    vmovlps %xmm0, (%rdx)
; AVX2-FP-NEXT:    vmovlps %xmm1, (%rcx)
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf2:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT:    vmovlps %xmm2, (%rsi)
; AVX2-FCP-NEXT:    vmovlps %xmm0, (%rdx)
; AVX2-FCP-NEXT:    vmovlps %xmm1, (%rcx)
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride3_vf2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512-NEXT:    vmovlps %xmm2, (%rsi)
; AVX512-NEXT:    vmovlps %xmm0, (%rdx)
; AVX512-NEXT:    vmovlps %xmm1, (%rcx)
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf2:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
; AVX512-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
; AVX512-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-FCP-NEXT:    vmovq %xmm2, (%rsi)
; AVX512-FCP-NEXT:    vmovq %xmm3, (%rdx)
; AVX512-FCP-NEXT:    vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride3_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512DQ-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512DQ-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX512DQ-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512DQ-NEXT:    vmovlps %xmm2, (%rsi)
; AVX512DQ-NEXT:    vmovlps %xmm0, (%rdx)
; AVX512DQ-NEXT:    vmovlps %xmm1, (%rcx)
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride3_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512BW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX512BW-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512BW-NEXT:    vmovlps %xmm2, (%rsi)
; AVX512BW-NEXT:    vmovlps %xmm0, (%rdx)
; AVX512BW-NEXT:    vmovlps %xmm1, (%rcx)
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
; AVX512BW-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
; AVX512BW-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride3_vf2:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-BW-NEXT:    vbroadcastss 8(%rdi), %xmm3
; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512DQ-BW-NEXT:    vmovlps %xmm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovlps %xmm0, (%rdx)
; AVX512DQ-BW-NEXT:    vmovlps %xmm1, (%rcx)
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <6 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
  %strided.vec1 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 1, i32 4>
  %strided.vec2 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 2, i32 5>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  ret void
}

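; Editorial note: vf4 de-interleaves a <12 x i32> load into three <4 x i32>
; vectors. Observe below that the AVX512 variants need only one vpermps per
; output from a single zmm load, while SSE assembles each output from
; shufps/punpckldq chains.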
define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,3]
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps %xmm5, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride3_vf4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1]
; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    vmovaps %xmm3, (%rsi)
; AVX-NEXT:    vmovaps %xmm4, (%rdx)
; AVX-NEXT:    vmovaps %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride3_vf4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [0,3,6,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,4,7,2]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vmovaps {{.*#+}} xmm4 = [2,5,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vmovaps %xmm2, (%rsi)
; AVX2-NEXT:    vmovaps %xmm3, (%rdx)
; AVX2-NEXT:    vmovaps %xmm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf4:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm2 = [0,3,6,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm3 = [1,4,7,2]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm4, %ymm3, %ymm3
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm4 = [2,5,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-FP-NEXT:    vmovaps %xmm2, (%rsi)
; AVX2-FP-NEXT:    vmovaps %xmm3, (%rdx)
; AVX2-FP-NEXT:    vmovaps %xmm0, (%rcx)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf4:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm2 = [0,3,6,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm3 = [1,4,7,2]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm4 = [2,5,0,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-FCP-NEXT:    vmovaps %xmm2, (%rsi)
; AVX2-FCP-NEXT:    vmovaps %xmm3, (%rdx)
; AVX2-FCP-NEXT:    vmovaps %xmm0, (%rcx)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride3_vf4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512-NEXT:    vmovaps (%rdi), %zmm1
; AVX512-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf4:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512-FCP-NEXT:    vmovaps (%rdi), %zmm1
; AVX512-FCP-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512-FCP-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512-FCP-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512-FCP-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-FCP-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512-FCP-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride3_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512DQ-NEXT:    vmovaps (%rdi), %zmm1
; AVX512DQ-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512DQ-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512DQ-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512DQ-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512DQ-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512DQ-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512DQ-FCP-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512DQ-FCP-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512DQ-FCP-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride3_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512BW-NEXT:    vmovaps (%rdi), %zmm1
; AVX512BW-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512BW-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512BW-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512BW-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512BW-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf4:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512BW-FCP-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512BW-FCP-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512BW-FCP-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512BW-FCP-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512BW-FCP-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride3_vf4:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512DQ-BW-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512DQ-BW-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512DQ-BW-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512DQ-BW-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10]
; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11]
; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm1, %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm1, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <12 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  ret void
}

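; Editorial note: vf8 de-interleaves a <24 x i32> load into three <8 x i32>
; vectors. The AVX512 variants below use a single cross-lane vpermi2d with two
; zmm inputs per output, while AVX2 first blends ymm registers and then applies
; one vpermps per output.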
define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 80(%rdi), %xmm1
; SSE-NEXT:    movaps 64(%rdi), %xmm5
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    movaps 32(%rdi), %xmm4
; SSE-NEXT:    movdqa 48(%rdi), %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm7, %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2]
; SSE-NEXT:    movdqa %xmm2, %xmm7
; SSE-NEXT:    movaps %xmm5, %xmm10
; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3]
; SSE-NEXT:    movaps %xmm7, 16(%rsi)
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm2, 16(%rdx)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps %xmm11, 16(%rcx)
; SSE-NEXT:    movaps %xmm6, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride3_vf8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX-NEXT:    vmovaps (%rdi), %ymm2
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm4
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm6
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vmovaps %ymm3, (%rsi)
; AVX-NEXT:    vmovaps %ymm5, (%rdx)
; AVX-NEXT:    vmovaps %ymm0, (%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride3_vf8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-NEXT:    vmovaps 64(%rdi), %ymm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
; AVX2-NEXT:    vpermps %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-NEXT:    vpermps %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
; AVX2-NEXT:    vmovaps %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf8:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm4, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-FP-NEXT:    vpermps %ymm4, %ymm5, %ymm4
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm0, (%rcx)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf8:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm4, %ymm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm5, %ymm4
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rcx)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride3_vf8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf8:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride3_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512DQ-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf8:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride3_vf8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BW-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf8:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride3_vf8:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf8:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <24 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %strided.vec1 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %strided.vec2 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
  ret void
}

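; Editorial note: vf16 de-interleaves a <48 x i32> load into three <16 x i32>
; vectors. Each AVX512 output below takes two vpermi2d steps because the 48
; source elements span three zmm registers.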
define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 96(%rdi), %xmm6
; SSE-NEXT:    movaps 128(%rdi), %xmm12
; SSE-NEXT:    movaps 112(%rdi), %xmm13
; SSE-NEXT:    movaps 144(%rdi), %xmm11
; SSE-NEXT:    movaps 176(%rdi), %xmm10
; SSE-NEXT:    movaps 160(%rdi), %xmm9
; SSE-NEXT:    movaps (%rdi), %xmm7
; SSE-NEXT:    movaps 16(%rdi), %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 48(%rdi), %xmm15
; SSE-NEXT:    movaps 80(%rdi), %xmm14
; SSE-NEXT:    movaps 64(%rdi), %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
; SSE-NEXT:    movaps %xmm15, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm8, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
; SSE-NEXT:    movaps %xmm7, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm9, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm11, %xmm3
; SSE-NEXT:    movaps %xmm11, %xmm4
; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm13, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm6, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm15, %xmm11
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0]
; SSE-NEXT:    movaps %xmm9, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2]
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm10
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0]
; SSE-NEXT:    movaps %xmm13, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0]
; SSE-NEXT:    movaps %xmm8, %xmm12
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2]
; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT:    # xmm8 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT:    # xmm8 = xmm8[0,1],mem[0,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT:    # xmm4 = xmm4[0,1],mem[0,3]
; SSE-NEXT:    movaps %xmm5, 32(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    movaps %xmm3, 48(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    movaps %xmm3, 16(%rsi)
; SSE-NEXT:    movaps %xmm1, 32(%rdx)
; SSE-NEXT:    movaps %xmm6, 48(%rdx)
; SSE-NEXT:    movaps %xmm7, (%rdx)
; SSE-NEXT:    movaps %xmm11, 16(%rdx)
; SSE-NEXT:    movaps %xmm4, 32(%rcx)
; SSE-NEXT:    movaps %xmm8, 48(%rcx)
; SSE-NEXT:    movaps %xmm0, (%rcx)
; SSE-NEXT:    movaps %xmm2, 16(%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride3_vf16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 160(%rdi), %ymm0
; AVX-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
; AVX-NEXT:    vmovaps 64(%rdi), %ymm3
; AVX-NEXT:    vmovaps 32(%rdi), %ymm4
; AVX-NEXT:    vmovaps (%rdi), %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm7
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
; AVX-NEXT:    vmovaps 112(%rdi), %xmm9
; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4]
; AVX-NEXT:    vmovaps 112(%rdi), %xmm13
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[0,3],ymm6[6,4],ymm4[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[0,3],ymm7[4,5],ymm3[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm2[2,0],ymm13[5,4],ymm2[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm0[0,3],ymm9[4,5],ymm0[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vmovaps %ymm8, 32(%rsi)
; AVX-NEXT:    vmovaps %ymm5, (%rsi)
; AVX-NEXT:    vmovaps %ymm12, 32(%rdx)
; AVX-NEXT:    vmovaps %ymm10, (%rdx)
; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX-NEXT:    vmovaps %ymm3, (%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride3_vf16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps 160(%rdi), %ymm0
; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX2-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm3
; AVX2-NEXT:    vmovaps 64(%rdi), %ymm4
; AVX2-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
; AVX2-NEXT:    vpermps %ymm6, %ymm7, %ymm6
; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
; AVX2-NEXT:    vpermps %ymm8, %ymm7, %ymm7
; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm8
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-NEXT:    vmovaps %ymm7, 32(%rsi)
; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
; AVX2-NEXT:    vmovaps %ymm9, 32(%rdx)
; AVX2-NEXT:    vmovaps %ymm8, (%rdx)
; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovaps %ymm2, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf16:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm3
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm4
; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm7, %ymm6
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm7, %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm3, %ymm1
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
; AVX2-FP-NEXT:    vmovaps %ymm9, 32(%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm8, (%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm2, (%rcx)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf16:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm7, %ymm6
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm7
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm9, %ymm9
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,1,4,7]
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm3, %ymm0
; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rcx)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride3_vf16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
; AVX512-NEXT:    vmovdqa64 %zmm4, (%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm5, (%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf16:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride3_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf16:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride3_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rsi)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf16:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1127; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
1128; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
1129; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rcx)
1130; AVX512BW-FCP-NEXT:    vzeroupper
1131; AVX512BW-FCP-NEXT:    retq
1132;
1133; AVX512DQ-BW-LABEL: load_i32_stride3_vf16:
1134; AVX512DQ-BW:       # %bb.0:
1135; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1136; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1137; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
1138; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1139; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1140; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1141; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1142; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1143; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1144; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1145; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1146; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1147; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1148; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1149; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1150; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%rsi)
1151; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, (%rdx)
1152; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1153; AVX512DQ-BW-NEXT:    vzeroupper
1154; AVX512DQ-BW-NEXT:    retq
1155;
1156; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf16:
1157; AVX512DQ-BW-FCP:       # %bb.0:
1158; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1159; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1160; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
1161; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1162; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1163; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1164; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1165; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1166; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1167; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1168; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1169; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1170; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1171; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1172; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1173; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
1174; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
1175; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rcx)
1176; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1177; AVX512DQ-BW-FCP-NEXT:    retq
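; The IR below is the pattern under test: one wide <48 x i32> load, split by
; shufflevector into the three stride-3 subsequences (start offsets 0, 1, 2).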
  %wide.vec = load <48 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %strided.vec1 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %strided.vec2 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
  ret void
}

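; Same deinterleave at twice the width. The reference IR follows the vf16
; shape (sketch only, masks elided):
;   %wide.vec = load <96 x i32>, ptr %in.vec, align 64
;   %strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 0, i32 3, i32 6, ...>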
define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf32:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $392, %rsp # imm = 0x188
; SSE-NEXT:    movaps 192(%rdi), %xmm4
; SSE-NEXT:    movaps 224(%rdi), %xmm3
; SSE-NEXT:    movaps 208(%rdi), %xmm14
; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 240(%rdi), %xmm7
; SSE-NEXT:    movaps 272(%rdi), %xmm10
; SSE-NEXT:    movaps 256(%rdi), %xmm9
; SSE-NEXT:    movaps (%rdi), %xmm13
; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 16(%rdi), %xmm8
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 32(%rdi), %xmm11
; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 48(%rdi), %xmm2
; SSE-NEXT:    movaps 80(%rdi), %xmm12
; SSE-NEXT:    movaps 64(%rdi), %xmm5
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm9, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm8, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0]
; SSE-NEXT:    movaps %xmm13, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm14, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
; SSE-NEXT:    movaps %xmm3, %xmm13
; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm11
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 176(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 160(%rdi), %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
; SSE-NEXT:    movaps 144(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm3, (%rsp) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 368(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 352(%rdi), %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
; SSE-NEXT:    movaps 336(%rdi), %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 128(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 112(%rdi), %xmm15
; SSE-NEXT:    movaps %xmm15, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
; SSE-NEXT:    movaps 96(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 320(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 304(%rdi), %xmm6
; SSE-NEXT:    movaps %xmm6, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
; SSE-NEXT:    movaps 288(%rdi), %xmm8
; SSE-NEXT:    movaps %xmm8, %xmm1
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm7, %xmm14
; SSE-NEXT:    movaps %xmm9, %xmm0
; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2]
; SSE-NEXT:    movaps %xmm11, %xmm10
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
; SSE-NEXT:    movaps %xmm3, %xmm9
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0]
; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[3,1],mem[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2]
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0]
; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[3,1],mem[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    movaps %xmm4, %xmm11
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0]
; SSE-NEXT:    movaps %xmm15, %xmm0
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0]
; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT:    # xmm6 = xmm6[3,1],mem[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
; SSE-NEXT:    movaps %xmm0, %xmm13
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2]
; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT:    # xmm13 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT:    # xmm8 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT:    # xmm8 = xmm8[0,1],mem[0,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT:    # xmm13 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload
; SSE-NEXT:    # xmm6 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT:    # xmm6 = xmm6[0,1],mem[0,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT:    # xmm15 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT:    # xmm13 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT:    # xmm13 = xmm13[0,1],mem[0,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT:    # xmm15 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT:    # xmm15 = xmm15[0,1],mem[0,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0,1],mem[0,3]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT:    # xmm2 = mem[1,1,1,1]
; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,3]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 96(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 32(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 112(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 48(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 64(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, (%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 80(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 16(%rsi)
; SSE-NEXT:    movaps %xmm3, 96(%rdx)
; SSE-NEXT:    movaps %xmm11, 32(%rdx)
; SSE-NEXT:    movaps %xmm7, 112(%rdx)
; SSE-NEXT:    movaps %xmm9, 48(%rdx)
; SSE-NEXT:    movaps %xmm10, 64(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, (%rdx)
; SSE-NEXT:    movaps %xmm14, 80(%rdx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT:    movaps %xmm2, 16(%rdx)
; SSE-NEXT:    movaps %xmm0, 96(%rcx)
; SSE-NEXT:    movaps %xmm1, 112(%rcx)
; SSE-NEXT:    movaps %xmm15, 64(%rcx)
; SSE-NEXT:    movaps %xmm13, 80(%rcx)
; SSE-NEXT:    movaps %xmm4, 32(%rcx)
; SSE-NEXT:    movaps %xmm6, 48(%rcx)
; SSE-NEXT:    movaps %xmm5, (%rcx)
; SSE-NEXT:    movaps %xmm8, 16(%rcx)
; SSE-NEXT:    addq $392, %rsp # imm = 0x188
; SSE-NEXT:    retq
;
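; The AVX lowering also needs a 392-byte spill area; since vshufps cannot cross
; 128-bit lanes, vperm2f128 supplies the lane-swapped inputs first.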
; AVX-LABEL: load_i32_stride3_vf32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $392, %rsp # imm = 0x188
; AVX-NEXT:    vmovaps 256(%rdi), %ymm2
; AVX-NEXT:    vmovaps 224(%rdi), %ymm7
; AVX-NEXT:    vmovaps 192(%rdi), %ymm3
; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 352(%rdi), %ymm4
; AVX-NEXT:    vmovaps 320(%rdi), %ymm5
; AVX-NEXT:    vmovaps 288(%rdi), %ymm6
; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 160(%rdi), %ymm10
; AVX-NEXT:    vmovaps 128(%rdi), %ymm9
; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
; AVX-NEXT:    vmovaps 112(%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
; AVX-NEXT:    vmovaps 304(%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7]
; AVX-NEXT:    vmovaps 208(%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 32(%rdi), %ymm15
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7]
; AVX-NEXT:    vmovaps (%rdi), %ymm2
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
; AVX-NEXT:    vmovaps 64(%rdi), %ymm7
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm11, (%rsp) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4]
; AVX-NEXT:    vmovaps 112(%rdi), %xmm13
; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
; AVX-NEXT:    # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4]
; AVX-NEXT:    vmovaps 304(%rdi), %xmm8
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4]
; AVX-NEXT:    vmovaps 208(%rdi), %xmm10
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
; AVX-NEXT:    # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7]
; AVX-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX-NEXT:    # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7]
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
; AVX-NEXT:    # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7]
; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
; AVX-NEXT:    # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7]
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
; AVX-NEXT:    # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, (%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, 64(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, 96(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
; AVX-NEXT:    vmovaps %ymm4, 64(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, (%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, 96(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)
; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
; AVX-NEXT:    vmovaps %ymm0, (%rcx)
; AVX-NEXT:    vmovaps %ymm5, 96(%rcx)
; AVX-NEXT:    vmovaps %ymm6, 32(%rcx)
; AVX-NEXT:    addq $392, %rsp # imm = 0x188
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
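; AVX2 permutes across lanes directly: each output is a three-way blend of the
; source vectors followed by one vpermps ([0,3,6,1,4,7,2,5], [1,4,7,2,5,0,3,6],
; or [2,5,0,3,6,u,u,u] with the top elements blended in afterwards).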
; AVX2-LABEL: load_i32_stride3_vf32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    vmovaps 256(%rdi), %ymm15
; AVX2-NEXT:    vmovaps 224(%rdi), %ymm5
; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
; AVX2-NEXT:    vmovaps 352(%rdi), %ymm4
; AVX2-NEXT:    vmovaps 320(%rdi), %ymm8
; AVX2-NEXT:    vmovaps 288(%rdi), %ymm10
; AVX2-NEXT:    vmovaps 160(%rdi), %ymm13
; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX2-NEXT:    vmovaps (%rdi), %ymm6
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm9
; AVX2-NEXT:    vmovaps 64(%rdi), %ymm7
; AVX2-NEXT:    vmovaps 96(%rdi), %ymm0
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm2
; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm2
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm2
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
; AVX2-NEXT:    vpermps %ymm14, %ymm12, %ymm2
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6]
; AVX2-NEXT:    vpermps %ymm12, %ymm2, %ymm11
; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
; AVX2-NEXT:    vpermps %ymm11, %ymm2, %ymm11
; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
; AVX2-NEXT:    vpermps %ymm12, %ymm2, %ymm12
; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermps %ymm14, %ymm2, %ymm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
; AVX2-NEXT:    vpermps %ymm8, %ymm1, %ymm8
; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
; AVX2-NEXT:    vpermps %ymm6, %ymm1, %ymm6
; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, 64(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, 96(%rsi)
; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
; AVX2-NEXT:    vmovaps %ymm2, 64(%rdx)
; AVX2-NEXT:    vmovaps %ymm12, (%rdx)
; AVX2-NEXT:    vmovaps %ymm11, 96(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm2, 32(%rdx)
; AVX2-NEXT:    vmovaps %ymm1, 64(%rcx)
; AVX2-NEXT:    vmovaps %ymm6, (%rcx)
; AVX2-NEXT:    vmovaps %ymm4, 96(%rcx)
; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf32:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    subq $40, %rsp
; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm15
; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm4
; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm8
; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm10
; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm13
; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm6
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm9
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm7
; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm0
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm12, %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm2, %ymm11
; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
; AVX2-FP-NEXT:    vpermps %ymm11, %ymm2, %ymm11
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
; AVX2-FP-NEXT:    vpermps %ymm12, %ymm2, %ymm12
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermps %ymm14, %ymm2, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm1, %ymm6
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm1
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rsi)
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm12, (%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm11, 96(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm6, (%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX2-FP-NEXT:    addq $40, %rsp
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf32:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    subq $72, %rsp
; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm6
; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm7
; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm14
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm8
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm9
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm10
; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm15
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6],ymm14[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovaps %ymm0, %ymm2
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,4,7,2,5,0,3,6]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm3, %ymm5
; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm3, %ymm5
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm3, %ymm11
; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm0
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,0,3,6,1,4,7]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm2, %ymm7
; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm2, %ymm4
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm2, %ymm6
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rsi)
; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rsi)
; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm11, (%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
; AVX2-FCP-NEXT:    addq $72, %rsp
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
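; AVX512 reuses the six vf16 index vectors, applying each one to both 64-byte
; halves of the input with vpermt2d/vpermi2d.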
; AVX512-LABEL: load_i32_stride3_vf32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm9
; AVX512-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
; AVX512-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
; AVX512-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm6, (%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
; AVX512-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf32:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride3_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm9
; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf32:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride3_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rsi)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf32:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
; AVX512BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride3_vf32:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
2049; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
2050; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
2051; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
2052; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
2053; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
2054; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm9
2055; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
2056; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
2057; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
2058; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
2059; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
2060; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
2061; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
2062; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
2063; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
2064; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
2065; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
2066; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
2067; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, (%rsi)
2068; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
2069; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rdx)
2070; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
2071; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rcx)
2072; AVX512DQ-BW-NEXT:    vzeroupper
2073; AVX512DQ-BW-NEXT:    retq
2074;
2075; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf32:
2076; AVX512DQ-BW-FCP:       # %bb.0:
2077; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
2078; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
2079; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
2080; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
2081; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
2082; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
2083; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
2084; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
2085; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm6, %zmm7
2086; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
2087; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm7
2088; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
2089; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
2090; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
2091; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
2092; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm8, %zmm9
2093; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
2094; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm9
2095; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm8
2096; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm10, %zmm8
2097; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
2098; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm10, %zmm5
2099; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
2100; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm5
2101; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm10, %zmm2
2102; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm1, %zmm2
2103; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
2104; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
2105; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
2106; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
2107; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
2108; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
2109; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2110; AVX512DQ-BW-FCP-NEXT:    retq
2111  %wide.vec = load <96 x i32>, ptr %in.vec, align 64
2112  %strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
2113  %strided.vec1 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94>
2114  %strided.vec2 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95>
2115  store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
2116  store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
2117  store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
2118  ret void
2119}
2120
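; The vf64 test below has the same shape as the vf32 body above: a single
; wide load of <3*VF x i32>, three shufflevectors that each gather every
; third element (starting at offsets 0, 1 and 2), and a store of each
; deinterleaved result. A minimal sketch of that pattern at VF=4, for
; illustration only (not a RUN/CHECK target in this file):
;
;   %wide.vec = load <12 x i32>, ptr %in.vec, align 64
;   %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
;   %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
;   %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>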
2121define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
2122; SSE-LABEL: load_i32_stride3_vf64:
2123; SSE:       # %bb.0:
2124; SSE-NEXT:    subq $1112, %rsp # imm = 0x458
2125; SSE-NEXT:    movaps 624(%rdi), %xmm2
2126; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2127; SSE-NEXT:    movaps 656(%rdi), %xmm4
2128; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2129; SSE-NEXT:    movaps 640(%rdi), %xmm10
2130; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2131; SSE-NEXT:    movaps 432(%rdi), %xmm6
2132; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2133; SSE-NEXT:    movaps 464(%rdi), %xmm5
2134; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2135; SSE-NEXT:    movaps 448(%rdi), %xmm11
2136; SSE-NEXT:    movaps %xmm11, (%rsp) # 16-byte Spill
2137; SSE-NEXT:    movaps 240(%rdi), %xmm7
2138; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2139; SSE-NEXT:    movaps 272(%rdi), %xmm3
2140; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2141; SSE-NEXT:    movaps 256(%rdi), %xmm13
2142; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2143; SSE-NEXT:    movaps 48(%rdi), %xmm9
2144; SSE-NEXT:    movaps 80(%rdi), %xmm1
2145; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2146; SSE-NEXT:    movaps 64(%rdi), %xmm12
2147; SSE-NEXT:    movaps %xmm12, %xmm0
2148; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2149; SSE-NEXT:    movaps %xmm9, %xmm1
2150; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2151; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2152; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2153; SSE-NEXT:    movaps %xmm13, %xmm0
2154; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
2155; SSE-NEXT:    movaps %xmm7, %xmm1
2156; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2157; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158; SSE-NEXT:    movaps %xmm11, %xmm0
2159; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0]
2160; SSE-NEXT:    movaps %xmm6, %xmm1
2161; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2162; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2163; SSE-NEXT:    movaps %xmm10, %xmm0
2164; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0]
2165; SSE-NEXT:    movaps %xmm2, %xmm1
2166; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2167; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2168; SSE-NEXT:    movaps 16(%rdi), %xmm0
2169; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2170; SSE-NEXT:    movaps 32(%rdi), %xmm1
2171; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2172; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2173; SSE-NEXT:    movaps (%rdi), %xmm1
2174; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2175; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2176; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2177; SSE-NEXT:    movaps 224(%rdi), %xmm1
2178; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2179; SSE-NEXT:    movaps 208(%rdi), %xmm0
2180; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2181; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2182; SSE-NEXT:    movaps 192(%rdi), %xmm1
2183; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2184; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2185; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2186; SSE-NEXT:    movaps 416(%rdi), %xmm1
2187; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2188; SSE-NEXT:    movaps 400(%rdi), %xmm0
2189; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2190; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2191; SSE-NEXT:    movaps 384(%rdi), %xmm1
2192; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2193; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2194; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2195; SSE-NEXT:    movaps 608(%rdi), %xmm1
2196; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2197; SSE-NEXT:    movaps 592(%rdi), %xmm0
2198; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2199; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2200; SSE-NEXT:    movaps 576(%rdi), %xmm1
2201; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2202; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2203; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2204; SSE-NEXT:    movaps 176(%rdi), %xmm10
2205; SSE-NEXT:    movaps 160(%rdi), %xmm8
2206; SSE-NEXT:    movaps %xmm8, %xmm0
2207; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
2208; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2209; SSE-NEXT:    movaps 144(%rdi), %xmm2
2210; SSE-NEXT:    movaps %xmm2, %xmm1
2211; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2212; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2213; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2214; SSE-NEXT:    movaps 368(%rdi), %xmm1
2215; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2216; SSE-NEXT:    movaps 352(%rdi), %xmm15
2217; SSE-NEXT:    movaps %xmm15, %xmm0
2218; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2219; SSE-NEXT:    movaps 336(%rdi), %xmm14
2220; SSE-NEXT:    movaps %xmm14, %xmm1
2221; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2222; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2223; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2224; SSE-NEXT:    movaps 560(%rdi), %xmm1
2225; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2226; SSE-NEXT:    movaps 544(%rdi), %xmm0
2227; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2228; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2229; SSE-NEXT:    movaps 528(%rdi), %xmm1
2230; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2231; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2232; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2233; SSE-NEXT:    movaps 752(%rdi), %xmm1
2234; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2235; SSE-NEXT:    movaps 736(%rdi), %xmm0
2236; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2237; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2238; SSE-NEXT:    movaps 720(%rdi), %xmm1
2239; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2240; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2241; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2242; SSE-NEXT:    movaps 128(%rdi), %xmm6
2243; SSE-NEXT:    movaps 112(%rdi), %xmm4
2244; SSE-NEXT:    movaps %xmm4, %xmm1
2245; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0]
2246; SSE-NEXT:    movaps 96(%rdi), %xmm3
2247; SSE-NEXT:    movaps %xmm3, %xmm7
2248; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2249; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2250; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2251; SSE-NEXT:    movaps 320(%rdi), %xmm13
2252; SSE-NEXT:    movaps 304(%rdi), %xmm11
2253; SSE-NEXT:    movaps %xmm11, %xmm1
2254; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0]
2255; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2256; SSE-NEXT:    movaps 288(%rdi), %xmm5
2257; SSE-NEXT:    movaps %xmm5, %xmm7
2258; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2259; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2260; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2261; SSE-NEXT:    movaps 512(%rdi), %xmm0
2262; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2263; SSE-NEXT:    movaps 496(%rdi), %xmm1
2264; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2265; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2266; SSE-NEXT:    movaps 480(%rdi), %xmm7
2267; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2268; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2269; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2270; SSE-NEXT:    movaps 704(%rdi), %xmm7
2271; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2272; SSE-NEXT:    movaps 688(%rdi), %xmm1
2273; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2274; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0]
2275; SSE-NEXT:    movaps 672(%rdi), %xmm7
2276; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2277; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2278; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2279; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2280; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0]
2281; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2282; SSE-NEXT:    # xmm12 = xmm12[3,1],mem[2,3]
2283; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2]
2284; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2285; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2286; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0]
2287; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3]
2288; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2]
2289; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2290; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2291; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
2292; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3]
2293; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
2294; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2295; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2296; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2297; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0]
2298; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2299; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2300; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2]
2301; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2302; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2303; SSE-NEXT:    movaps %xmm9, %xmm0
2304; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2305; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0]
2306; SSE-NEXT:    movaps %xmm3, %xmm1
2307; SSE-NEXT:    movaps %xmm3, %xmm12
2308; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2309; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2310; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2311; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2312; SSE-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2313; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0]
2314; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2315; SSE-NEXT:    # xmm15 = xmm15[3,1],mem[2,3]
2316; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2]
2317; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2318; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2319; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0]
2320; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3]
2321; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2]
2322; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2323; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2324; SSE-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
2325; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2326; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2327; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2328; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2329; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2330; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2331; SSE-NEXT:    movaps %xmm7, %xmm0
2332; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2333; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
2334; SSE-NEXT:    movaps %xmm4, %xmm1
2335; SSE-NEXT:    movaps %xmm4, %xmm8
2336; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2337; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2338; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2339; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2340; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2341; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2342; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2343; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2344; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2345; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2346; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2347; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2348; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2349; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2350; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2351; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3]
2352; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2353; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2354; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2355; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2356; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2357; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2358; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2359; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2360; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2361; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2362; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2363; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0]
2364; SSE-NEXT:    movaps %xmm14, %xmm1
2365; SSE-NEXT:    movaps %xmm14, %xmm3
2366; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2367; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2368; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2369; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2370; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2371; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2372; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2373; SSE-NEXT:    shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2374; SSE-NEXT:    # xmm1 = xmm1[3,1],mem[2,3]
2375; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2376; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2377; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2378; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2379; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0]
2380; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2381; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3]
2382; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2]
2383; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2384; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
2385; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2386; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
2387; SSE-NEXT:    movaps %xmm2, %xmm11
2388; SSE-NEXT:    movaps %xmm0, %xmm1
2389; SSE-NEXT:    movaps %xmm0, %xmm2
2390; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2391; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2392; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2]
2393; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2394; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
2395; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
2396; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3]
2397; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2398; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2399; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
2400; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2401; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
2402; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2403; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2404; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,3]
2405; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2406; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2407; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2408; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2409; SSE-NEXT:    # xmm15 = mem[2,3,2,3]
2410; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2411; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3]
2412; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2413; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2414; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2415; SSE-NEXT:    # xmm13 = mem[2,3,2,3]
2416; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
2417; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2418; SSE-NEXT:    # xmm13 = xmm13[0,1],mem[0,3]
2419; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
2420; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3]
2421; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
2422; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2423; SSE-NEXT:    # xmm12 = xmm12[0,1],mem[0,3]
2424; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2425; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2426; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2427; SSE-NEXT:    # xmm11 = mem[2,3,2,3]
2428; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
2429; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2430; SSE-NEXT:    # xmm11 = xmm11[0,1],mem[0,3]
2431; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2432; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2433; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2434; SSE-NEXT:    # xmm10 = mem[2,3,2,3]
2435; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
2436; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2437; SSE-NEXT:    # xmm10 = xmm10[0,1],mem[0,3]
2438; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2439; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2440; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2441; SSE-NEXT:    # xmm9 = mem[2,3,2,3]
2442; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
2443; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2444; SSE-NEXT:    # xmm9 = xmm9[0,1],mem[0,3]
2445; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2446; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
2447; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2448; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2449; SSE-NEXT:    # xmm8 = xmm8[0,1],mem[0,3]
2450; SSE-NEXT:    pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload
2451; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2452; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2453; SSE-NEXT:    # xmm7 = mem[2,3,2,3]
2454; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2455; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2456; SSE-NEXT:    # xmm7 = xmm7[0,1],mem[0,3]
2457; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2458; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2459; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2460; SSE-NEXT:    # xmm6 = mem[2,3,2,3]
2461; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
2462; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
2463; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2464; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2465; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2466; SSE-NEXT:    # xmm5 = mem[2,3,2,3]
2467; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2468; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2469; SSE-NEXT:    # xmm5 = xmm5[0,1],mem[0,3]
2470; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
2471; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2472; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
2473; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2474; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2475; SSE-NEXT:    # xmm3 = xmm3[0,1],mem[0,3]
2476; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2477; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2478; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2479; SSE-NEXT:    # xmm2 = mem[2,3,2,3]
2480; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2481; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2482; SSE-NEXT:    # xmm2 = xmm2[0,1],mem[0,3]
2483; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2484; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2485; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2486; SSE-NEXT:    # xmm1 = mem[2,3,2,3]
2487; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2488; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3]
2489; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2490; SSE-NEXT:    # xmm4 = mem[1,1,1,1]
2491; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2492; SSE-NEXT:    # xmm0 = mem[2,3,2,3]
2493; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2494; SSE-NEXT:    shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2495; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,3]
2496; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2497; SSE-NEXT:    movaps %xmm4, 224(%rsi)
2498; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2499; SSE-NEXT:    movaps %xmm4, 160(%rsi)
2500; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2501; SSE-NEXT:    movaps %xmm4, 96(%rsi)
2502; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2503; SSE-NEXT:    movaps %xmm4, 32(%rsi)
2504; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2505; SSE-NEXT:    movaps %xmm4, 240(%rsi)
2506; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2507; SSE-NEXT:    movaps %xmm4, 176(%rsi)
2508; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2509; SSE-NEXT:    movaps %xmm4, 112(%rsi)
2510; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2511; SSE-NEXT:    movaps %xmm4, 48(%rsi)
2512; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2513; SSE-NEXT:    movaps %xmm4, 192(%rsi)
2514; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2515; SSE-NEXT:    movaps %xmm4, 128(%rsi)
2516; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2517; SSE-NEXT:    movaps %xmm4, 64(%rsi)
2518; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2519; SSE-NEXT:    movaps %xmm4, (%rsi)
2520; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2521; SSE-NEXT:    movaps %xmm4, 208(%rsi)
2522; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2523; SSE-NEXT:    movaps %xmm4, 144(%rsi)
2524; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2525; SSE-NEXT:    movaps %xmm4, 80(%rsi)
2526; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2527; SSE-NEXT:    movaps %xmm4, 16(%rsi)
2528; SSE-NEXT:    movaps %xmm14, 224(%rdx)
2529; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2530; SSE-NEXT:    movaps %xmm4, 240(%rdx)
2531; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2532; SSE-NEXT:    movaps %xmm4, 192(%rdx)
2533; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2534; SSE-NEXT:    movaps %xmm4, 208(%rdx)
2535; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2536; SSE-NEXT:    movaps %xmm4, 160(%rdx)
2537; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2538; SSE-NEXT:    movaps %xmm4, 176(%rdx)
2539; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2540; SSE-NEXT:    movaps %xmm4, 128(%rdx)
2541; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2542; SSE-NEXT:    movaps %xmm4, 144(%rdx)
2543; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2544; SSE-NEXT:    movaps %xmm4, 96(%rdx)
2545; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2546; SSE-NEXT:    movaps %xmm4, 112(%rdx)
2547; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2548; SSE-NEXT:    movaps %xmm4, 64(%rdx)
2549; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2550; SSE-NEXT:    movaps %xmm4, 80(%rdx)
2551; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2552; SSE-NEXT:    movaps %xmm4, 32(%rdx)
2553; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2554; SSE-NEXT:    movaps %xmm4, 48(%rdx)
2555; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2556; SSE-NEXT:    movaps %xmm4, (%rdx)
2557; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2558; SSE-NEXT:    movaps %xmm4, 16(%rdx)
2559; SSE-NEXT:    movaps %xmm0, 240(%rcx)
2560; SSE-NEXT:    movaps %xmm1, 224(%rcx)
2561; SSE-NEXT:    movaps %xmm2, 208(%rcx)
2562; SSE-NEXT:    movaps %xmm3, 192(%rcx)
2563; SSE-NEXT:    movaps %xmm5, 176(%rcx)
2564; SSE-NEXT:    movaps %xmm6, 160(%rcx)
2565; SSE-NEXT:    movaps %xmm7, 144(%rcx)
2566; SSE-NEXT:    movaps %xmm8, 128(%rcx)
2567; SSE-NEXT:    movaps %xmm9, 112(%rcx)
2568; SSE-NEXT:    movaps %xmm10, 96(%rcx)
2569; SSE-NEXT:    movaps %xmm11, 80(%rcx)
2570; SSE-NEXT:    movaps %xmm12, 64(%rcx)
2571; SSE-NEXT:    movaps %xmm13, 48(%rcx)
2572; SSE-NEXT:    movaps %xmm15, 32(%rcx)
2573; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2574; SSE-NEXT:    movaps %xmm0, 16(%rcx)
2575; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2576; SSE-NEXT:    movaps %xmm0, (%rcx)
2577; SSE-NEXT:    addq $1112, %rsp # imm = 0x458
2578; SSE-NEXT:    retq
2579;
2580; AVX-LABEL: load_i32_stride3_vf64:
2581; AVX:       # %bb.0:
2582; AVX-NEXT:    subq $1384, %rsp # imm = 0x568
2583; AVX-NEXT:    vmovaps 544(%rdi), %ymm2
2584; AVX-NEXT:    vmovaps 512(%rdi), %ymm3
2585; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2586; AVX-NEXT:    vmovaps 480(%rdi), %ymm4
2587; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2588; AVX-NEXT:    vmovaps 352(%rdi), %ymm5
2589; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2590; AVX-NEXT:    vmovaps 320(%rdi), %ymm6
2591; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2592; AVX-NEXT:    vmovaps 288(%rdi), %ymm7
2593; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2594; AVX-NEXT:    vmovaps 160(%rdi), %ymm8
2595; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2596; AVX-NEXT:    vmovaps 128(%rdi), %ymm9
2597; AVX-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
2598; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
2599; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2600; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
2601; AVX-NEXT:    vmovaps 112(%rdi), %xmm1
2602; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
2603; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
2604; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
2605; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2606; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4]
2607; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2608; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2609; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2610; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
2611; AVX-NEXT:    vmovaps 304(%rdi), %xmm1
2612; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7]
2613; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
2614; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1]
2615; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4]
2616; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2617; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2618; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2619; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
2620; AVX-NEXT:    vmovaps 496(%rdi), %xmm1
2621; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7]
2622; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
2623; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2624; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
2625; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4]
2626; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2627; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2628; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2629; AVX-NEXT:    vmovaps 704(%rdi), %ymm2
2630; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2631; AVX-NEXT:    vmovaps 688(%rdi), %xmm0
2632; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7]
2633; AVX-NEXT:    vmovaps 672(%rdi), %ymm1
2634; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2635; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
2636; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2637; AVX-NEXT:    vmovaps 736(%rdi), %ymm1
2638; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2639; AVX-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1]
2640; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
2641; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2642; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2643; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2644; AVX-NEXT:    vmovaps 32(%rdi), %ymm7
2645; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
2646; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7]
2647; AVX-NEXT:    vmovaps (%rdi), %ymm1
2648; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2649; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7]
2650; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2651; AVX-NEXT:    vmovaps 64(%rdi), %ymm4
2652; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
2653; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2654; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
2655; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2656; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2657; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2658; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2659; AVX-NEXT:    vmovaps 224(%rdi), %ymm6
2660; AVX-NEXT:    vmovaps 208(%rdi), %xmm0
2661; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7]
2662; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
2663; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2664; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7]
2665; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2666; AVX-NEXT:    vmovaps 256(%rdi), %ymm5
2667; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1]
2668; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2669; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4]
2670; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2671; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2672; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2673; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2674; AVX-NEXT:    vmovaps 416(%rdi), %ymm12
2675; AVX-NEXT:    vmovaps 400(%rdi), %xmm0
2676; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7]
2677; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
2678; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2679; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7]
2680; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2681; AVX-NEXT:    vmovaps 448(%rdi), %ymm8
2682; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
2683; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
2684; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2685; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2686; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2687; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2688; AVX-NEXT:    vmovaps 608(%rdi), %ymm10
2689; AVX-NEXT:    vmovaps 592(%rdi), %xmm0
2690; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
2691; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
2692; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2693; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7]
2694; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2695; AVX-NEXT:    vmovaps 640(%rdi), %ymm13
2696; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1]
2697; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4]
2698; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2699; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2700; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2701; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2702; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2703; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2704; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
2705; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
2706; AVX-NEXT:    vmovaps 112(%rdi), %xmm0
2707; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2708; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
2709; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2710; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2711; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7]
2712; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2713; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2714; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2715; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2716; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2717; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4]
2718; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4]
2719; AVX-NEXT:    vmovaps 304(%rdi), %xmm2
2720; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2721; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2722; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2723; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2724; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
2725; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2726; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
2727; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2728; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2729; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2730; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4]
2731; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4]
2732; AVX-NEXT:    vmovaps 496(%rdi), %xmm3
2733; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2734; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2735; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2736; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2737; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7]
2738; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2739; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
2740; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2741; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2742; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2743; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4]
2744; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4]
2745; AVX-NEXT:    vmovaps 688(%rdi), %xmm3
2746; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2747; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2748; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2749; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2750; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7]
2751; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2752; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
2753; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2754; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2755; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
2756; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
2757; AVX-NEXT:    vmovaps 16(%rdi), %xmm4
2758; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2759; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
2760; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7]
2761; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
2762; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
2763; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2764; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2765; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4]
2766; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4]
2767; AVX-NEXT:    vmovaps 208(%rdi), %xmm5
2768; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2769; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
2770; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7]
2771; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5]
2772; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
2773; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2774; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2775; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4]
2776; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4]
2777; AVX-NEXT:    vmovaps 400(%rdi), %xmm8
2778; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2779; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7]
2780; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7]
2781; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5]
2782; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7]
2783; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2784; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4]
2785; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4]
2786; AVX-NEXT:    vmovaps 592(%rdi), %xmm9
2787; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2788; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7]
2789; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7]
2790; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
2791; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
2792; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2793; AVX-NEXT:    vmovups (%rsp), %ymm15 # 32-byte Reload
2794; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
2795; AVX-NEXT:    # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
2796; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
2797; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4]
2798; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7]
2799; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
2800; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
2801; AVX-NEXT:    # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7]
2802; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
2803; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
2804; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
2805; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7]
2806; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2807; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
2808; AVX-NEXT:    # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
2809; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
2810; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX-NEXT:    # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload
; AVX-NEXT:    # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT:    # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
; AVX-NEXT:    # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
; AVX-NEXT:    # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
; AVX-NEXT:    # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
; AVX-NEXT:    # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
; AVX-NEXT:    # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7]
; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
; AVX-NEXT:    # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 192(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 128(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 64(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, (%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 224(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 160(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 96(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 32(%rsi)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 192(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 128(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 64(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, (%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 224(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 160(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 96(%rdx)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm4, 32(%rdx)
; AVX-NEXT:    vmovaps %ymm3, 192(%rcx)
; AVX-NEXT:    vmovaps %ymm5, 224(%rcx)
; AVX-NEXT:    vmovaps %ymm2, 128(%rcx)
; AVX-NEXT:    vmovaps %ymm1, 160(%rcx)
; AVX-NEXT:    vmovaps %ymm0, 64(%rcx)
; AVX-NEXT:    vmovaps %ymm14, 96(%rcx)
; AVX-NEXT:    vmovaps %ymm7, (%rcx)
; AVX-NEXT:    vmovaps %ymm13, 32(%rcx)
; AVX-NEXT:    addq $1384, %rsp # imm = 0x568
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride3_vf64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $1032, %rsp # imm = 0x408
; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 704(%rdi), %ymm3
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 672(%rdi), %ymm4
; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 544(%rdi), %ymm5
; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 512(%rdi), %ymm6
; AVX2-NEXT:    vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vmovaps 480(%rdi), %ymm7
; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 352(%rdi), %ymm8
; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 288(%rdi), %ymm11
; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 160(%rdi), %ymm9
; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 96(%rdi), %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps (%rdi), %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm3
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 64(%rdi), %ymm8
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7]
; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermps %ymm4, %ymm0, %ymm4
; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 256(%rdi), %ymm1
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 224(%rdi), %ymm2
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
; AVX2-NEXT:    vpermps %ymm6, %ymm0, %ymm6
; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 448(%rdi), %ymm13
; AVX2-NEXT:    vmovaps 416(%rdi), %ymm12
; AVX2-NEXT:    vmovaps 384(%rdi), %ymm14
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm10
; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps 640(%rdi), %ymm6
; AVX2-NEXT:    vmovaps 608(%rdi), %ymm5
; AVX2-NEXT:    vmovaps 576(%rdi), %ymm7
; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7]
; AVX2-NEXT:    vpermps %ymm15, %ymm0, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
; AVX2-NEXT:    vmovaps %ymm13, %ymm14
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm13
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
; AVX2-NEXT:    vmovaps %ymm6, %ymm7
; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm15
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u]
; AVX2-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm1 = mem[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm12, %ymm1
; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm2 = mem[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
; AVX2-NEXT:    vpermps %ymm2, %ymm12, %ymm2
; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
; AVX2-NEXT:    vpermps %ymm3, %ymm12, %ymm3
; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm4 = mem[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
; AVX2-NEXT:    vpermps %ymm4, %ymm12, %ymm4
; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
; AVX2-NEXT:    vpermps %ymm5, %ymm12, %ymm5
; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm6 = mem[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
; AVX2-NEXT:    vpermps %ymm6, %ymm12, %ymm6
; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 192(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 128(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 64(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, (%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 224(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 160(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 96(%rsi)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 32(%rsi)
; AVX2-NEXT:    vmovaps %ymm15, 192(%rdx)
; AVX2-NEXT:    vmovaps %ymm13, 128(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 64(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, (%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 224(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 160(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 96(%rdx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm7, 32(%rdx)
; AVX2-NEXT:    vmovaps %ymm6, 192(%rcx)
; AVX2-NEXT:    vmovaps %ymm5, 224(%rcx)
; AVX2-NEXT:    vmovaps %ymm4, 128(%rcx)
; AVX2-NEXT:    vmovaps %ymm3, 160(%rcx)
; AVX2-NEXT:    vmovaps %ymm2, 64(%rcx)
; AVX2-NEXT:    vmovaps %ymm1, 96(%rcx)
; AVX2-NEXT:    vmovaps %ymm0, (%rcx)
; AVX2-NEXT:    vmovaps %ymm9, 32(%rcx)
; AVX2-NEXT:    addq $1032, %rsp # imm = 0x408
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf64:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    subq $1032, %rsp # imm = 0x408
; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm3
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm4
; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm7
; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm8
; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm11
; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm9
; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm3
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm8
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7]
; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpermps %ymm4, %ymm0, %ymm4
; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm2
; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm13
; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm12
; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm14
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7]
; AVX2-FP-NEXT:    vpermps %ymm15, %ymm0, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
; AVX2-FP-NEXT:    vmovaps %ymm13, %ymm14
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm13
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm7
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm15
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermps %ymm1, %ymm12, %ymm1
; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm2 = mem[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
; AVX2-FP-NEXT:    vpermps %ymm3, %ymm12, %ymm3
; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
; AVX2-FP-NEXT:    vpermps %ymm4, %ymm12, %ymm4
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
; AVX2-FP-NEXT:    vpermps %ymm5, %ymm12, %ymm5
; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm6 = mem[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm12, %ymm6
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rsi)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
; AVX2-FP-NEXT:    vmovaps %ymm15, 192(%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm13, 128(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rdx)
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm4, 128(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm3, 160(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm0, (%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm9, 32(%rcx)
; AVX2-FP-NEXT:    addq $1032, %rsp # imm = 0x408
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf64:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    subq $1032, %rsp # imm = 0x408
; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm5
; AVX2-FCP-NEXT:    vmovups %ymm5, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm6
; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm7
; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm8
; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm9
; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm10
; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm11
; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm14
; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm13
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm5
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm3
; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm8
; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm7
; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7]
; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7]
; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps %ymm14, %ymm10
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovaps %ymm9, %ymm14
; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
; AVX2-FCP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FCP-NEXT:    vblendps $73, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FCP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7]
; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm8 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7]
; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm8 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
; AVX2-FCP-NEXT:    vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7]
; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm7 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vblendps $146, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm6 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
; AVX2-FCP-NEXT:    vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [2,5,0,3,6,1,4,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm4
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm8
; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm6
; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm3
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
; AVX2-FCP-NEXT:    vmovaps %ymm11, 192(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rdx)
; AVX2-FCP-NEXT:    vmovaps %ymm3, 192(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm6, 224(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm8, 96(%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rcx)
; AVX2-FCP-NEXT:    addq $1032, %rsp # imm = 0x408
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride3_vf64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm4
; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm5
; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm6
; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm7
; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm1
; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm8
; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm10
; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm11
; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm13
; AVX512-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm15
; AVX512-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
; AVX512-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
; AVX512-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
; AVX512-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
; AVX512-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm17
; AVX512-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm19
; AVX512-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
; AVX512-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm20
; AVX512-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
; AVX512-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
; AVX512-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
; AVX512-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
; AVX512-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
; AVX512-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
; AVX512-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
; AVX512-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
; AVX512-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
; AVX512-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
; AVX512-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm12, (%rsi)
; AVX512-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm14, (%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
; AVX512-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
; AVX512-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf64:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm6
; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm8
; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm10
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm11
; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
; AVX512-FCP-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm17
; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm20
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride3_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm5
; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm6
; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm7
; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm8
; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm10
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm11
; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm13
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm15
; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm16
; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
; AVX512DQ-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm17
; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm19
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm20
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rsi)
; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rcx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
3769; AVX512DQ-FCP-LABEL: load_i32_stride3_vf64:
3770; AVX512DQ-FCP:       # %bb.0:
3771; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm4
3772; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm5
3773; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm0
3774; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm6
3775; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
3776; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
3777; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm8
3778; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
3779; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
3780; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm10
3781; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm11
3782; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
3783; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3784; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
3785; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
3786; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3787; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
3788; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
3789; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
3790; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
3791; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
3792; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
3793; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
3794; AVX512DQ-FCP-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
3795; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
3796; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3797; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm17
3798; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
3799; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3800; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
3801; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
3802; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
3803; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
3804; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm20
3805; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
3806; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
3807; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
3808; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
3809; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3810; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
3811; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3812; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
3813; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
3814; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
3815; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
3816; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
3817; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
3818; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
3819; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
3820; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
3821; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
3822; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, (%rsi)
3823; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
3824; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
3825; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
3826; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
3827; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
3828; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
3829; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
3830; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
3831; AVX512DQ-FCP-NEXT:    vzeroupper
3832; AVX512DQ-FCP-NEXT:    retq
3833;
3834; AVX512BW-LABEL: load_i32_stride3_vf64:
3835; AVX512BW:       # %bb.0:
3836; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm4
3837; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm5
3838; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm0
3839; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm6
3840; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
3841; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
3842; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm8
3843; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
3844; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
3845; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm10
3846; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm11
3847; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
3848; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3849; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
3850; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
3851; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3852; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
3853; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
3854; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
3855; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
3856; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16
3857; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
3858; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
3859; AVX512BW-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
3860; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
3861; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3862; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm17
3863; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
3864; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3865; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
3866; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm19
3867; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
3868; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
3869; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm20
3870; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
3871; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
3872; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
3873; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
3874; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3875; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
3876; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3877; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
3878; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
3879; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
3880; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
3881; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
3882; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
3883; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
3884; AVX512BW-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
3885; AVX512BW-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
3886; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
3887; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rsi)
3888; AVX512BW-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
3889; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
3890; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
3891; AVX512BW-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
3892; AVX512BW-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
3893; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
3894; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rcx)
3895; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
3896; AVX512BW-NEXT:    vzeroupper
3897; AVX512BW-NEXT:    retq
3898;
3899; AVX512BW-FCP-LABEL: load_i32_stride3_vf64:
3900; AVX512BW-FCP:       # %bb.0:
3901; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm4
3902; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm5
3903; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm0
3904; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm6
3905; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
3906; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
3907; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm8
3908; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
3909; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
3910; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm10
3911; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm11
3912; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
3913; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3914; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
3915; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
3916; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3917; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
3918; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
3919; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
3920; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
3921; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
3922; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
3923; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
3924; AVX512BW-FCP-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
3925; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
3926; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3927; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm17
3928; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
3929; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3930; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
3931; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
3932; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
3933; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
3934; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm20
3935; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
3936; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
3937; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
3938; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
3939; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3940; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
3941; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3942; AVX512BW-FCP-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
3943; AVX512BW-FCP-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
3944; AVX512BW-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
3945; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
3946; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
3947; AVX512BW-FCP-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
3948; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
3949; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
3950; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
3951; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
3952; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rsi)
3953; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
3954; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
3955; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
3956; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
3957; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
3958; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
3959; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
3960; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
3961; AVX512BW-FCP-NEXT:    vzeroupper
3962; AVX512BW-FCP-NEXT:    retq
3963;
3964; AVX512DQ-BW-LABEL: load_i32_stride3_vf64:
3965; AVX512DQ-BW:       # %bb.0:
3966; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm4
3967; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm5
3968; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm0
3969; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm6
3970; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
3971; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
3972; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm8
3973; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
3974; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm2
3975; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm10
3976; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm11
3977; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
3978; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3979; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm13
3980; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
3981; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3982; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
3983; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm15
3984; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
3985; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
3986; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm16
3987; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
3988; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
3989; AVX512DQ-BW-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
3990; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
3991; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3992; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm17
3993; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
3994; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3995; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
3996; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm19
3997; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
3998; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
3999; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm20
4000; AVX512DQ-BW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
4001; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
4002; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
4003; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
4004; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
4005; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
4006; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
4007; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
4008; AVX512DQ-BW-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
4009; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
4010; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
4011; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
4012; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
4013; AVX512DQ-BW-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
4014; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
4015; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
4016; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
4017; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, (%rsi)
4018; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
4019; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
4020; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
4021; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
4022; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
4023; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
4024; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rcx)
4025; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
4026; AVX512DQ-BW-NEXT:    vzeroupper
4027; AVX512DQ-BW-NEXT:    retq
4028;
4029; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf64:
4030; AVX512DQ-BW-FCP:       # %bb.0:
4031; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm4
4032; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm5
4033; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm0
4034; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm6
4035; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
4036; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
4037; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm8
4038; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm9
4039; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
4040; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm10
4041; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm11
4042; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm3
4043; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
4044; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
4045; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm12, %zmm13
4046; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
4047; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm14, %zmm13
4048; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm15
4049; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm12, %zmm15
4050; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm14, %zmm15
4051; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
4052; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm12, %zmm16
4053; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm14, %zmm16
4054; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm10, %zmm2, %zmm12
4055; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm12
4056; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
4057; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm17
4058; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm14, %zmm17
4059; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
4060; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm18, %zmm17
4061; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
4062; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm14, %zmm19
4063; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm18, %zmm19
4064; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm20
4065; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm20
4066; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm20
4067; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm10, %zmm14
4068; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm14
4069; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
4070; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm18, %zmm3
4071; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
4072; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm8, %zmm9, %zmm3
4073; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm5, %zmm18, %zmm0
4074; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm4, %zmm9, %zmm0
4075; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm1
4076; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm9, %zmm1
4077; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm10, %zmm18, %zmm2
4078; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm9, %zmm2
4079; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
4080; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rsi)
4081; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
4082; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rsi)
4083; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rdx)
4084; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
4085; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
4086; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 128(%rdx)
4087; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rcx)
4088; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rcx)
4089; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
4090; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rcx)
4091; AVX512DQ-BW-FCP-NEXT:    vzeroupper
4092; AVX512DQ-BW-FCP-NEXT:    retq
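;
; Note: every AVX512 configuration above agrees on the same lowering strategy,
; which the checks verify: vpmovsxbd materializes each 16-dword index pattern,
; a vpermt2d/vpermi2d two-source permute collects the strided elements from a
; pair of adjacent 64-byte blocks, and a second permute splices in the
; remaining lanes from the third block of each 192-byte group.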
  %wide.vec = load <192 x i32>, ptr %in.vec, align 64
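  ; Deinterleave the 192 contiguous dwords into three 64-element strided
  ; vectors: every third element, starting at offsets 0, 1, and 2 respectively.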
  %strided.vec0 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
  %strided.vec1 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
  %strided.vec2 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
  ret void
}
