; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
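;
; An illustrative source-level sketch (not part of this test, names are made
; up) of a loop that the LoopVectorizer lowers to such patterns: each
; iteration reads the five i16 fields of one stride-5 element group, so
; vectorizing by VF loads one wide contiguous vector and de-interleaves it
; with stride-5 shufflevector masks, as in the IR at the end of each function
; below.
;
;   struct S { short a, b, c, d, e; };   // one stride-5 group of i16 values
;   void split(struct S *in, short *o0, short *o1, short *o2, short *o3,
;              short *o4, int n) {
;     for (int i = 0; i < n; i++) {      // strided accesses, stride 5
;       o0[i] = in[i].a; o1[i] = in[i].b; o2[i] = in[i].c;
;       o3[i] = in[i].d; o4[i] = in[i].e;
;     }
;   }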

define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; SSE-NEXT:    psrlq $48, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT:    movd %xmm2, (%rsi)
; SSE-NEXT:    movd %xmm3, (%rdx)
; SSE-NEXT:    movd %xmm4, (%rcx)
; SSE-NEXT:    movd %xmm0, (%r8)
; SSE-NEXT:    movd %xmm5, (%r9)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i16_stride5_vf2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm5
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT:    vmovd %xmm2, (%rsi)
; AVX-NEXT:    vmovd %xmm3, (%rdx)
; AVX-NEXT:    vmovd %xmm4, (%rcx)
; AVX-NEXT:    vmovd %xmm5, (%r8)
; AVX-NEXT:    vmovd %xmm0, (%r9)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i16_stride5_vf2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-NEXT:    vmovd %xmm2, (%rsi)
; AVX2-NEXT:    vmovd %xmm3, (%rdx)
; AVX2-NEXT:    vmovd %xmm4, (%rcx)
; AVX2-NEXT:    vmovd %xmm0, (%r8)
; AVX2-NEXT:    vmovd %xmm1, (%r9)
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf2:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vmovd %xmm2, (%rsi)
; AVX2-FP-NEXT:    vmovd %xmm3, (%rdx)
; AVX2-FP-NEXT:    vmovd %xmm4, (%rcx)
; AVX2-FP-NEXT:    vmovd %xmm0, (%r8)
; AVX2-FP-NEXT:    vmovd %xmm1, (%r9)
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf2:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vmovd %xmm2, (%rsi)
; AVX2-FCP-NEXT:    vmovd %xmm3, (%rdx)
; AVX2-FCP-NEXT:    vmovd %xmm4, (%rcx)
; AVX2-FCP-NEXT:    vmovd %xmm0, (%r8)
; AVX2-FCP-NEXT:    vmovd %xmm1, (%r9)
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i16_stride5_vf2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512-NEXT:    vmovd %xmm2, (%rsi)
; AVX512-NEXT:    vmovd %xmm3, (%rdx)
; AVX512-NEXT:    vmovd %xmm4, (%rcx)
; AVX512-NEXT:    vmovd %xmm0, (%r8)
; AVX512-NEXT:    vmovd %xmm1, (%r9)
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf2:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512-FCP-NEXT:    vmovd %xmm2, (%rsi)
; AVX512-FCP-NEXT:    vmovd %xmm3, (%rdx)
; AVX512-FCP-NEXT:    vmovd %xmm4, (%rcx)
; AVX512-FCP-NEXT:    vmovd %xmm0, (%r8)
; AVX512-FCP-NEXT:    vmovd %xmm1, (%r9)
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-NEXT:    vmovd %xmm2, (%rsi)
; AVX512DQ-NEXT:    vmovd %xmm3, (%rdx)
; AVX512DQ-NEXT:    vmovd %xmm4, (%rcx)
; AVX512DQ-NEXT:    vmovd %xmm0, (%r8)
; AVX512DQ-NEXT:    vmovd %xmm1, (%r9)
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf2:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT:    vmovd %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovd %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovd %xmm4, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovd %xmm0, (%r8)
; AVX512DQ-FCP-NEXT:    vmovd %xmm1, (%r9)
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i16_stride5_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512BW-NEXT:    vmovd %xmm2, (%rsi)
; AVX512BW-NEXT:    vmovd %xmm3, (%rdx)
; AVX512BW-NEXT:    vmovd %xmm4, (%rcx)
; AVX512BW-NEXT:    vmovd %xmm0, (%r8)
; AVX512BW-NEXT:    vmovd %xmm1, (%r9)
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf2:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%rdx)
; AVX512BW-FCP-NEXT:    vmovd %xmm4, (%rcx)
; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%r8)
; AVX512BW-FCP-NEXT:    vmovd %xmm1, (%r9)
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf2:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-BW-NEXT:    vmovd %xmm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovd %xmm3, (%rdx)
; AVX512DQ-BW-NEXT:    vmovd %xmm4, (%rcx)
; AVX512DQ-BW-NEXT:    vmovd %xmm0, (%r8)
; AVX512DQ-BW-NEXT:    vmovd %xmm1, (%r9)
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf2:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm1, (%r9)
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <10 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 0, i32 5>
  %strided.vec1 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 1, i32 6>
  %strided.vec2 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 2, i32 7>
  %strided.vec3 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 3, i32 8>
  %strided.vec4 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 4, i32 9>
  store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm2
; SSE-NEXT:    movdqa 16(%rdi), %xmm3
; SSE-NEXT:    movdqa 32(%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $48, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1]
; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3]
; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0]
; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
; SSE-NEXT:    pand %xmm7, %xmm2
; SSE-NEXT:    pandn %xmm0, %xmm7
; SSE-NEXT:    por %xmm2, %xmm7
; SSE-NEXT:    movq %xmm1, (%rsi)
; SSE-NEXT:    movq %xmm4, (%rdx)
; SSE-NEXT:    movq %xmm5, (%rcx)
; SSE-NEXT:    movq %xmm6, (%r8)
; SSE-NEXT:    movq %xmm7, (%r9)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i16_stride5_vf4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX-NEXT:    vmovdqa (%rdi), %xmm1
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX-NEXT:    vmovdqa 32(%rdi), %xmm3
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm4
; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    vmovq %xmm4, (%rdx)
; AVX-NEXT:    vmovq %xmm5, (%rcx)
; AVX-NEXT:    vmovq %xmm6, (%r8)
; AVX-NEXT:    vmovq %xmm1, (%r9)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i16_stride5_vf4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX2-NEXT:    vmovq %xmm3, (%rsi)
; AVX2-NEXT:    vmovq %xmm4, (%rdx)
; AVX2-NEXT:    vmovq %xmm5, (%rcx)
; AVX2-NEXT:    vmovq %xmm6, (%r8)
; AVX2-NEXT:    vmovq %xmm0, (%r9)
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf4:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovq %xmm3, (%rsi)
; AVX2-FP-NEXT:    vmovq %xmm4, (%rdx)
; AVX2-FP-NEXT:    vmovq %xmm5, (%rcx)
; AVX2-FP-NEXT:    vmovq %xmm6, (%r8)
; AVX2-FP-NEXT:    vmovq %xmm0, (%r9)
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf4:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovq %xmm3, (%rsi)
; AVX2-FCP-NEXT:    vmovq %xmm4, (%rdx)
; AVX2-FCP-NEXT:    vmovq %xmm5, (%rcx)
; AVX2-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX2-FCP-NEXT:    vmovq %xmm0, (%r9)
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i16_stride5_vf4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vpextrw $5, %xmm0, %eax
; AVX512-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm3
; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512-NEXT:    vpextrw $7, %xmm1, %eax
; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512-NEXT:    vpextrw $6, %xmm0, %eax
; AVX512-NEXT:    vpextrw $1, %xmm0, %r10d
; AVX512-NEXT:    vmovd %r10d, %xmm4
; AVX512-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
; AVX512-NEXT:    vpextrw $3, %xmm1, %eax
; AVX512-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm1
; AVX512-NEXT:    vmovd %xmm2, %eax
; AVX512-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512-NEXT:    vmovq %xmm3, (%rsi)
; AVX512-NEXT:    vmovq %xmm1, (%rdx)
; AVX512-NEXT:    vmovq %xmm5, (%rcx)
; AVX512-NEXT:    vmovq %xmm6, (%r8)
; AVX512-NEXT:    vmovq %xmm0, (%r9)
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf4:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512-FCP-NEXT:    vpextrw $7, %xmm1, %eax
; AVX512-FCP-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512-FCP-NEXT:    vpextrw $3, %xmm1, %eax
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX512-FCP-NEXT:    vmovd %xmm2, %eax
; AVX512-FCP-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
; AVX512-FCP-NEXT:    vmovq %xmm1, (%rdx)
; AVX512-FCP-NEXT:    vmovq %xmm5, (%rcx)
; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT:    vmovq %xmm0, (%r9)
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512DQ-NEXT:    vpextrw $5, %xmm0, %eax
; AVX512DQ-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm3
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512DQ-NEXT:    vpextrw $7, %xmm1, %eax
; AVX512DQ-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpextrw $6, %xmm0, %eax
; AVX512DQ-NEXT:    vpextrw $1, %xmm0, %r10d
; AVX512DQ-NEXT:    vmovd %r10d, %xmm4
; AVX512DQ-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
; AVX512DQ-NEXT:    vpextrw $3, %xmm1, %eax
; AVX512DQ-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm1
; AVX512DQ-NEXT:    vmovd %xmm2, %eax
; AVX512DQ-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
; AVX512DQ-NEXT:    vmovq %xmm1, (%rdx)
; AVX512DQ-NEXT:    vmovq %xmm5, (%rcx)
; AVX512DQ-NEXT:    vmovq %xmm6, (%r8)
; AVX512DQ-NEXT:    vmovq %xmm0, (%r9)
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512DQ-FCP-NEXT:    vpextrw $7, %xmm1, %eax
; AVX512DQ-FCP-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT:    vpextrw $3, %xmm1, %eax
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT:    vmovd %xmm2, %eax
; AVX512DQ-FCP-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%r9)
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX512BW-NEXT:    vpextrw $7, %xmm3, %eax
; AVX512BW-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm1, %zmm3, %zmm3
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm1, %zmm5, %zmm1
; AVX512BW-NEXT:    vmovq %xmm2, (%rsi)
; AVX512BW-NEXT:    vmovq %xmm0, (%rdx)
; AVX512BW-NEXT:    vmovq %xmm3, (%rcx)
; AVX512BW-NEXT:    vmovq %xmm4, (%r8)
; AVX512BW-NEXT:    vmovq %xmm1, (%r9)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX512BW-FCP-NEXT:    vpextrw $7, %xmm3, %eax
; AVX512BW-FCP-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512BW-FCP-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm3
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm1
; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%r8)
; AVX512BW-FCP-NEXT:    vmovq %xmm1, (%r9)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-NEXT:    vpextrw $7, %xmm3, %eax
; AVX512DQ-BW-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512DQ-BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm3, %zmm3
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm4, %zmm4
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm5, %zmm1
; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rcx)
; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%r8)
; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%r9)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-FCP-NEXT:    vpextrw $7, %xmm3, %eax
; AVX512DQ-BW-FCP-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <20 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
  %strided.vec1 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
  %strided.vec2 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
  %strided.vec3 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
  %strided.vec4 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
  store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa 64(%rdi), %xmm6
; SSE-NEXT:    movdqa (%rdi), %xmm4
; SSE-NEXT:    movdqa 16(%rdi), %xmm3
; SSE-NEXT:    movdqa 32(%rdi), %xmm0
; SSE-NEXT:    movdqa 48(%rdi), %xmm5
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
; SSE-NEXT:    pand %xmm1, %xmm2
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    por %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE-NEXT:    andps %xmm1, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    pandn %xmm8, %xmm2
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm7
; SSE-NEXT:    psrlq $48, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,4,7]
; SSE-NEXT:    pand %xmm7, %xmm9
; SSE-NEXT:    pandn %xmm8, %xmm7
; SSE-NEXT:    por %xmm9, %xmm7
; SSE-NEXT:    pand %xmm1, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,1,2,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,1,1,3]
; SSE-NEXT:    psllq $48, %xmm6
; SSE-NEXT:    pandn %xmm6, %xmm1
; SSE-NEXT:    por %xmm7, %xmm1
; SSE-NEXT:    movdqa %xmm5, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
; SSE-NEXT:    movdqa %xmm5, %xmm12
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,0,1,3]
; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,1,1,3]
; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE-NEXT:    pand %xmm13, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1]
; SSE-NEXT:    movdqa %xmm13, %xmm15
; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm14[0,3,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[1,0,3,3,4,5,6,7]
; SSE-NEXT:    pand %xmm13, %xmm11
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
; SSE-NEXT:    movdqa %xmm13, %xmm4
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
; SSE-NEXT:    pand %xmm13, %xmm3
; SSE-NEXT:    pandn %xmm12, %xmm13
; SSE-NEXT:    por %xmm13, %xmm5
; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,1],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7]
; SSE-NEXT:    pandn %xmm8, %xmm15
; SSE-NEXT:    por %xmm15, %xmm11
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0]
; SSE-NEXT:    pandn %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,6]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,0]
; SSE-NEXT:    por %xmm4, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,3,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,4,7]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    movdqa %xmm1, (%rdx)
; SSE-NEXT:    movaps %xmm5, (%rcx)
; SSE-NEXT:    movaps %xmm11, (%r8)
; SSE-NEXT:    movaps %xmm3, (%r9)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i16_stride5_vf8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX-NEXT:    vmovdqa 64(%rdi), %xmm5
; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm6[7]
; AVX-NEXT:    vpsrlq $48, %xmm1, %xmm6
; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u]
; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4,5,6,7]
; AVX-NEXT:    vpsllq $48, %xmm5, %xmm7
; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,1,1,3]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15]
; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5],xmm8[6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5],xmm0[6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX-NEXT:    vmovdqa %xmm4, (%rsi)
; AVX-NEXT:    vmovdqa %xmm6, (%rdx)
; AVX-NEXT:    vmovdqa %xmm7, (%rcx)
; AVX-NEXT:    vmovdqa %xmm8, (%r8)
; AVX-NEXT:    vmovdqa %xmm0, (%r9)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i16_stride5_vf8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
; AVX2-NEXT:    vpbroadcastw 70(%rdi), %xmm3
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-NEXT:    vpsllq $48, %xmm4, %xmm5
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0]
; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3]
; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
; AVX2-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX2-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-NEXT:    vmovdqa %xmm5, (%rcx)
; AVX2-NEXT:    vmovdqa %xmm6, (%r8)
; AVX2-NEXT:    vmovdqa %xmm0, (%r9)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf8:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
; AVX2-FP-NEXT:    vpbroadcastw 70(%rdi), %xmm3
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-FP-NEXT:    vpsllq $48, %xmm4, %xmm5
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11]
; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
; AVX2-FP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rcx)
; AVX2-FP-NEXT:    vmovdqa %xmm6, (%r8)
; AVX2-FP-NEXT:    vmovdqa %xmm0, (%r9)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf8:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
; AVX2-FCP-NEXT:    vpbroadcastw 70(%rdi), %xmm3
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-FCP-NEXT:    vpsllq $48, %xmm4, %xmm5
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11]
; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX2-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%r9)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i16_stride5_vf8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
; AVX512-NEXT:    vpbroadcastw 70(%rdi), %xmm3
; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm3
; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
; AVX512-NEXT:    vpsllq $48, %xmm3, %xmm5
; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0]
; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
1032; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
1033; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
1034; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1035; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
1036; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
1037; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
1038; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1039; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
1040; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
1041; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1042; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
1043; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1044; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
1045; AVX512-NEXT:    vmovdqa %xmm1, (%rsi)
1046; AVX512-NEXT:    vmovdqa %xmm4, (%rdx)
1047; AVX512-NEXT:    vmovdqa %xmm5, (%rcx)
1048; AVX512-NEXT:    vmovdqa %xmm6, (%r8)
1049; AVX512-NEXT:    vmovdqa %xmm0, (%r9)
1050; AVX512-NEXT:    vzeroupper
1051; AVX512-NEXT:    retq
1052;
1053; AVX512-FCP-LABEL: load_i16_stride5_vf8:
1054; AVX512-FCP:       # %bb.0:
1055; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1056; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1057; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1058; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1059; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
1060; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
1061; AVX512-FCP-NEXT:    vpbroadcastw 70(%rdi), %xmm3
1062; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
1063; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
1064; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1065; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1066; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
1067; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
1068; AVX512-FCP-NEXT:    vpsllq $48, %xmm3, %xmm5
1069; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
1070; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11]
1071; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
1072; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1073; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1074; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
1075; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
1076; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1077; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
1078; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
1079; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1080; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
1081; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
1082; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
1083; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
1084; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
1085; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
1086; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1087; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1088; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
1089; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
1090; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
1091; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1092; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rdx)
1093; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
1094; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r8)
1095; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%r9)
1096; AVX512-FCP-NEXT:    vzeroupper
1097; AVX512-FCP-NEXT:    retq
1098;
1099; AVX512DQ-LABEL: load_i16_stride5_vf8:
1100; AVX512DQ:       # %bb.0:
1101; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1102; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm2
1103; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
1104; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
1105; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
1106; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
1107; AVX512DQ-NEXT:    vpbroadcastw 70(%rdi), %xmm3
1108; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
1109; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm3
1110; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
1111; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
1112; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
1113; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
1114; AVX512DQ-NEXT:    vpsllq $48, %xmm3, %xmm5
1115; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
1116; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15]
1117; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
1118; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
1119; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1120; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0]
1121; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5]
1122; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1123; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
1124; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm7
1125; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
1126; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1127; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
1128; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
1129; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
1130; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1131; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
1132; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
1133; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1134; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
1135; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1136; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
1137; AVX512DQ-NEXT:    vmovdqa %xmm1, (%rsi)
1138; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rdx)
1139; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rcx)
1140; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r8)
1141; AVX512DQ-NEXT:    vmovdqa %xmm0, (%r9)
1142; AVX512DQ-NEXT:    vzeroupper
1143; AVX512DQ-NEXT:    retq
1144;
1145; AVX512DQ-FCP-LABEL: load_i16_stride5_vf8:
1146; AVX512DQ-FCP:       # %bb.0:
1147; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1148; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1149; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1150; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1151; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
1152; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
1153; AVX512DQ-FCP-NEXT:    vpbroadcastw 70(%rdi), %xmm3
1154; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
1155; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
1156; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1157; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1158; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
1159; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
1160; AVX512DQ-FCP-NEXT:    vpsllq $48, %xmm3, %xmm5
1161; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
1162; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11]
1163; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
1164; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1165; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1166; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
1167; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
1168; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1169; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
1170; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
1171; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1172; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
1173; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
1174; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
1175; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
1176; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
1177; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
1178; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1179; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1180; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
1181; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
1182; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
1183; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1184; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rdx)
1185; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
1186; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r8)
1187; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%r9)
1188; AVX512DQ-FCP-NEXT:    vzeroupper
1189; AVX512DQ-FCP-NEXT:    retq
1190;
1191; AVX512BW-LABEL: load_i16_stride5_vf8:
1192; AVX512BW:       # %bb.0:
1193; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1194; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1195; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35]
1196; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1197; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36]
1198; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1199; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37]
1200; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1201; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38]
1202; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1203; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39]
1204; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1205; AVX512BW-NEXT:    vmovdqa %xmm2, (%rsi)
1206; AVX512BW-NEXT:    vmovdqa %xmm3, (%rdx)
1207; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
1208; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
1209; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
1210; AVX512BW-NEXT:    vzeroupper
1211; AVX512BW-NEXT:    retq
1212;
1213; AVX512BW-FCP-LABEL: load_i16_stride5_vf8:
1214; AVX512BW-FCP:       # %bb.0:
1215; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1216; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1217; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35]
1218; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1219; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36]
1220; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1221; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37]
1222; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1223; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38]
1224; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1225; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39]
1226; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1227; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1228; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
1229; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
1230; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
1231; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
1232; AVX512BW-FCP-NEXT:    vzeroupper
1233; AVX512BW-FCP-NEXT:    retq
1234;
1235; AVX512DQ-BW-LABEL: load_i16_stride5_vf8:
1236; AVX512DQ-BW:       # %bb.0:
1237; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1238; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1239; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35]
1240; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1241; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36]
1242; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1243; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37]
1244; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1245; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38]
1246; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1247; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39]
1248; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1249; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rsi)
1250; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rdx)
1251; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
1252; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
1253; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
1254; AVX512DQ-BW-NEXT:    vzeroupper
1255; AVX512DQ-BW-NEXT:    retq
1256;
1257; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf8:
1258; AVX512DQ-BW-FCP:       # %bb.0:
1259; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1260; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1261; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35]
1262; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1263; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36]
1264; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1265; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37]
1266; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1267; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38]
1268; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1269; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39]
1270; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1271; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1272; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
1273; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
1274; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
1275; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
1276; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1277; AVX512DQ-BW-FCP-NEXT:    retq
1278  %wide.vec = load <40 x i16>, ptr %in.vec, align 64
1279  %strided.vec0 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
1280  %strided.vec1 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
1281  %strided.vec2 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
1282  %strided.vec3 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
1283  %strided.vec4 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>
1284  store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
1285  store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
1286  store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
1287  store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
1288  store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
1289  ret void
1290}
1291
1292define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
1293; SSE-LABEL: load_i16_stride5_vf16:
1294; SSE:       # %bb.0:
1295; SSE-NEXT:    movdqa 144(%rdi), %xmm14
1296; SSE-NEXT:    movdqa 80(%rdi), %xmm8
1297; SSE-NEXT:    movdqa 96(%rdi), %xmm7
1298; SSE-NEXT:    movdqa 128(%rdi), %xmm15
1299; SSE-NEXT:    movdqa 112(%rdi), %xmm12
1300; SSE-NEXT:    movdqa 64(%rdi), %xmm10
1301; SSE-NEXT:    movdqa (%rdi), %xmm11
1302; SSE-NEXT:    movdqa 16(%rdi), %xmm9
1303; SSE-NEXT:    movdqa 32(%rdi), %xmm13
1304; SSE-NEXT:    movdqa 48(%rdi), %xmm5
1305; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
1306; SSE-NEXT:    movdqa %xmm0, %xmm1
1307; SSE-NEXT:    pandn %xmm13, %xmm1
1308; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
1309; SSE-NEXT:    pand %xmm0, %xmm2
1310; SSE-NEXT:    por %xmm1, %xmm2
1311; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3]
1312; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1313; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
1314; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7]
1315; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1316; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
1317; SSE-NEXT:    movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
1318; SSE-NEXT:    andps %xmm6, %xmm4
1319; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1]
1320; SSE-NEXT:    movaps %xmm6, %xmm2
1321; SSE-NEXT:    pandn %xmm1, %xmm2
1322; SSE-NEXT:    por %xmm4, %xmm2
1323; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1324; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3]
1325; SSE-NEXT:    pand %xmm0, %xmm1
1326; SSE-NEXT:    pandn %xmm12, %xmm0
1327; SSE-NEXT:    por %xmm1, %xmm0
1328; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3]
1329; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1330; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
1331; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
1332; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1333; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1334; SSE-NEXT:    andps %xmm6, %xmm2
1335; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1]
1336; SSE-NEXT:    movaps %xmm6, %xmm1
1337; SSE-NEXT:    andnps %xmm0, %xmm1
1338; SSE-NEXT:    orps %xmm2, %xmm1
1339; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1340; SSE-NEXT:    movdqa %xmm9, %xmm0
1341; SSE-NEXT:    psrlq $48, %xmm0
1342; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3]
1343; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
1344; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1345; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
1346; SSE-NEXT:    movdqa %xmm0, %xmm2
1347; SSE-NEXT:    pandn %xmm1, %xmm2
1348; SSE-NEXT:    movdqa %xmm5, %xmm3
1349; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3]
1350; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
1351; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1352; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7]
1353; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1354; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1355; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
1356; SSE-NEXT:    pand %xmm0, %xmm1
1357; SSE-NEXT:    por %xmm2, %xmm1
1358; SSE-NEXT:    movdqa %xmm10, %xmm5
1359; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1360; SSE-NEXT:    movdqa %xmm10, %xmm2
1361; SSE-NEXT:    psllq $48, %xmm2
1362; SSE-NEXT:    movaps %xmm6, %xmm4
1363; SSE-NEXT:    andnps %xmm2, %xmm4
1364; SSE-NEXT:    pand %xmm6, %xmm1
1365; SSE-NEXT:    orps %xmm1, %xmm4
1366; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1367; SSE-NEXT:    movdqa %xmm7, %xmm1
1368; SSE-NEXT:    psrlq $48, %xmm1
1369; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3]
1370; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
1371; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1372; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3]
1373; SSE-NEXT:    movdqa %xmm15, %xmm10
1374; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
1375; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1376; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7]
1377; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1378; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1379; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
1380; SSE-NEXT:    pand %xmm0, %xmm1
1381; SSE-NEXT:    pandn %xmm2, %xmm0
1382; SSE-NEXT:    por %xmm1, %xmm0
1383; SSE-NEXT:    pand %xmm6, %xmm0
1384; SSE-NEXT:    movdqa %xmm14, %xmm4
1385; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1386; SSE-NEXT:    movdqa %xmm14, %xmm1
1387; SSE-NEXT:    psllq $48, %xmm1
1388; SSE-NEXT:    pandn %xmm1, %xmm6
1389; SSE-NEXT:    por %xmm0, %xmm6
1390; SSE-NEXT:    movdqa %xmm3, %xmm0
1391; SSE-NEXT:    movdqa %xmm3, %xmm14
1392; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1393; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0]
1394; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3]
1395; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
1396; SSE-NEXT:    movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535]
1397; SSE-NEXT:    movaps %xmm3, %xmm1
1398; SSE-NEXT:    andnps %xmm0, %xmm1
1399; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm11[0,1,1,3]
1400; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7]
1401; SSE-NEXT:    punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3]
1402; SSE-NEXT:    pand %xmm3, %xmm15
1403; SSE-NEXT:    por %xmm1, %xmm15
1404; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1405; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0]
1406; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1407; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
1408; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0]
1409; SSE-NEXT:    movdqa %xmm10, %xmm5
1410; SSE-NEXT:    movdqa %xmm10, %xmm1
1411; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0]
1412; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3]
1413; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
1414; SSE-NEXT:    movaps %xmm3, %xmm2
1415; SSE-NEXT:    andnps %xmm1, %xmm2
1416; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,1,1,3]
1417; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1418; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
1419; SSE-NEXT:    pand %xmm3, %xmm0
1420; SSE-NEXT:    por %xmm2, %xmm0
1421; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1422; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0]
1423; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
1424; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
1425; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1426; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
1427; SSE-NEXT:    movdqa %xmm3, %xmm2
1428; SSE-NEXT:    pandn %xmm1, %xmm2
1429; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
1430; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
1431; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1432; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
1433; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1434; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
1435; SSE-NEXT:    pand %xmm3, %xmm1
1436; SSE-NEXT:    por %xmm2, %xmm1
1437; SSE-NEXT:    movdqa %xmm14, %xmm4
1438; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0]
1439; SSE-NEXT:    movdqa %xmm3, %xmm2
1440; SSE-NEXT:    pandn %xmm13, %xmm2
1441; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2]
1442; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7]
1443; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1444; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3]
1445; SSE-NEXT:    pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6]
1446; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3]
1447; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0]
1448; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7]
1449; SSE-NEXT:    movdqa %xmm3, %xmm14
1450; SSE-NEXT:    pandn %xmm4, %xmm14
1451; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1]
1452; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3]
1453; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
1454; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm13[0,3,2,3,4,5,6,7]
1455; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1456; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm4[1,0,3,3,4,5,6,7]
1457; SSE-NEXT:    pand %xmm3, %xmm13
1458; SSE-NEXT:    por %xmm14, %xmm13
1459; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[3,0]
1460; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[0,2]
1461; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0]
1462; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
1463; SSE-NEXT:    movdqa %xmm5, %xmm11
1464; SSE-NEXT:    movdqa %xmm5, %xmm4
1465; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[3,0]
1466; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,6,6,7]
1467; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
1468; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7]
1469; SSE-NEXT:    pand %xmm3, %xmm8
1470; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
1471; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1472; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7]
1473; SSE-NEXT:    pand %xmm3, %xmm7
1474; SSE-NEXT:    pandn %xmm12, %xmm3
1475; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2]
1476; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,4,6,7]
1477; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1478; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3]
1479; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6]
1480; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3]
1481; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0]
1482; SSE-NEXT:    por %xmm2, %xmm8
1483; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1484; SSE-NEXT:    # xmm2 = mem[0,2,2,3]
1485; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
1486; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1487; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
1488; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3]
1489; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0]
1490; SSE-NEXT:    por %xmm7, %xmm3
1491; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3]
1492; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3]
1493; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1494; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
1495; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3]
1496; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
1497; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1498; SSE-NEXT:    movaps %xmm2, 16(%rsi)
1499; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1500; SSE-NEXT:    movaps %xmm2, (%rsi)
1501; SSE-NEXT:    movdqa %xmm6, 16(%rdx)
1502; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1503; SSE-NEXT:    movaps %xmm2, (%rdx)
1504; SSE-NEXT:    movaps %xmm0, 16(%rcx)
1505; SSE-NEXT:    movaps %xmm15, (%rcx)
1506; SSE-NEXT:    movaps %xmm13, 16(%r8)
1507; SSE-NEXT:    movaps %xmm1, (%r8)
1508; SSE-NEXT:    movaps %xmm3, 16(%r9)
1509; SSE-NEXT:    movaps %xmm8, (%r9)
1510; SSE-NEXT:    retq
1511;
1512; AVX-LABEL: load_i16_stride5_vf16:
1513; AVX:       # %bb.0:
1514; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
1515; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
1516; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7]
1517; AVX-NEXT:    vmovdqa 112(%rdi), %xmm1
1518; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1]
1519; AVX-NEXT:    vmovdqa 80(%rdi), %xmm2
1520; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
1521; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
1522; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7]
1523; AVX-NEXT:    vmovdqa 144(%rdi), %xmm8
1524; AVX-NEXT:    vmovdqa 128(%rdi), %xmm7
1525; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
1526; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1527; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7]
1528; AVX-NEXT:    vmovdqa (%rdi), %xmm3
1529; AVX-NEXT:    vmovdqa 16(%rdi), %xmm4
1530; AVX-NEXT:    vmovdqa 32(%rdi), %xmm5
1531; AVX-NEXT:    vmovdqa 48(%rdi), %xmm6
1532; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
1533; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7]
1534; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3]
1535; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7]
1536; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
1537; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
1538; AVX-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1539; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
1540; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
1541; AVX-NEXT:    vandps %ymm11, %ymm9, %ymm12
1542; AVX-NEXT:    vmovaps 64(%rdi), %xmm9
1543; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1]
1544; AVX-NEXT:    vandnps %ymm13, %ymm11, %ymm13
1545; AVX-NEXT:    vorps %ymm13, %ymm12, %ymm12
1546; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm10
1547; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1548; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7]
1549; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
1550; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3]
1551; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
1552; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
1553; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
1554; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1555; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
1556; AVX-NEXT:    vpsllq $48, %xmm9, %xmm13
1557; AVX-NEXT:    vandnps %ymm13, %ymm11, %ymm13
1558; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm14
1559; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3]
1560; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7]
1561; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1562; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1563; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7]
1564; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
1565; AVX-NEXT:    vandps %ymm11, %ymm14, %ymm11
1566; AVX-NEXT:    vorps %ymm13, %ymm11, %ymm11
1567; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm11, %ymm11
1568; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
1569; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u]
1570; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3]
1571; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
1572; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
1573; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
1574; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1575; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
1576; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3]
1577; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7]
1578; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
1579; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
1580; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
1581; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7]
1582; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0]
1583; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
1584; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
1585; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
1586; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1587; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u]
1588; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm14
1589; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7]
1590; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7]
1591; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1592; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7]
1593; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
1594; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
1595; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7]
1596; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
1597; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
1598; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
1599; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
1600; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
1601; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
1602; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
1603; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1604; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
1605; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1606; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
1607; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1608; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1609; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
1610; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1611; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
1612; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7]
1613; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
1614; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1615; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1616; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1617; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
1618; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
1619; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
1620; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3]
1621; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1622; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
1623; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1624; AVX-NEXT:    vmovaps %ymm10, (%rsi)
1625; AVX-NEXT:    vmovaps %ymm11, (%rdx)
1626; AVX-NEXT:    vmovaps %ymm12, (%rcx)
1627; AVX-NEXT:    vmovaps %ymm13, (%r8)
1628; AVX-NEXT:    vmovaps %ymm0, (%r9)
1629; AVX-NEXT:    vzeroupper
1630; AVX-NEXT:    retq
1631;
1632; AVX2-LABEL: load_i16_stride5_vf16:
1633; AVX2:       # %bb.0:
1634; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
1635; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm3
1636; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
1637; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
1638; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1639; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
1640; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
1641; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
1642; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1643; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
1644; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
1645; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
1646; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
1647; AVX2-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
1648; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm6
1649; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm4
1650; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3]
1651; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1652; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
1653; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
1654; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
1655; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1656; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
1657; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1658; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
1659; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1660; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1661; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
1662; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
1663; AVX2-NEXT:    vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
1664; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
1665; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1666; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
1667; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
1668; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
1669; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1670; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
1671; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1672; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1673; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1674; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1675; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
1676; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
1677; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
1678; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
1679; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1680; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
1681; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
1682; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1683; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1684; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
1685; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1686; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1687; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1688; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1689; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
1690; AVX2-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
1691; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1692; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
1693; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1694; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
1695; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
1696; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1697; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1698; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1699; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1700; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1701; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1702; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1703; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
1704; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
1705; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1706; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
1707; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1708; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
1709; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1710; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1711; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
1712; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1713; AVX2-NEXT:    vmovdqa %ymm5, (%rsi)
1714; AVX2-NEXT:    vmovdqa %ymm7, (%rdx)
1715; AVX2-NEXT:    vmovdqa %ymm8, (%rcx)
1716; AVX2-NEXT:    vmovdqa %ymm9, (%r8)
1717; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
1718; AVX2-NEXT:    vzeroupper
1719; AVX2-NEXT:    retq
1720;
1721; AVX2-FP-LABEL: load_i16_stride5_vf16:
1722; AVX2-FP:       # %bb.0:
1723; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
1724; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm3
1725; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
1726; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
1727; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1728; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1729; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
1730; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
1731; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1732; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
1733; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
1734; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
1735; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
1736; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm6
1737; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm4
1738; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm5
1739; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3]
1740; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1741; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
1742; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
1743; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
1744; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1745; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
1746; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1747; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
1748; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1749; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1750; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
1751; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
1752; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
1753; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3]
1754; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1755; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
1756; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
1757; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
1758; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1759; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
1760; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1761; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1762; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1763; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1764; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
1765; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
1766; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
1767; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3]
1768; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1769; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
1770; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
1771; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1772; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1773; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm10
1774; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1775; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1776; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1777; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1778; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
1779; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
1780; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1781; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3]
1782; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1783; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
1784; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
1785; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1786; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1787; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1788; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1789; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1790; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1791; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1792; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
1793; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
1794; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1795; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
1796; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u]
1797; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1798; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
1799; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1800; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rsi)
1801; AVX2-FP-NEXT:    vmovdqa %ymm7, (%rdx)
1802; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rcx)
1803; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
1804; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
1805; AVX2-FP-NEXT:    vzeroupper
1806; AVX2-FP-NEXT:    retq
1807;
1808; AVX2-FCP-LABEL: load_i16_stride5_vf16:
1809; AVX2-FCP:       # %bb.0:
1810; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
1811; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
1812; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
1813; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
1814; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
1815; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
1816; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1817; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
1818; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
1819; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15]
1820; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3]
1821; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm7, %ymm6
1822; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
1823; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0]
1824; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
1825; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0]
1826; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm6, %ymm6
1827; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
1828; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
1829; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
1830; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
1831; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
1832; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm9
1833; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7]
1834; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
1835; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
1836; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,4,7,1,6]
1837; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm10, %ymm9
1838; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
1839; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm9, %ymm6
1840; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0]
1841; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm8, %ymm9
1842; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
1843; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
1844; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
1845; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
1846; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
1847; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
1848; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
1849; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1850; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
1851; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4]
1852; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
1853; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
1854; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1855; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0]
1856; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
1857; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm10, %ymm10
1858; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
1859; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
1860; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1861; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
1862; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
1863; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1864; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1865; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
1866; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
1867; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
1868; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
1869; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1870; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0]
1871; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
1872; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm10, %ymm10
1873; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
1874; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
1875; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1876; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
1877; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1878; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
1879; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1880; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
1881; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5]
1882; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1883; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
1884; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
1885; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,2,5,7]
1886; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
1887; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
1888; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
1889; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
1890; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
1891; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
1892; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%r8)
1893; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
1894; AVX2-FCP-NEXT:    vzeroupper
1895; AVX2-FCP-NEXT:    retq
1896;
1897; AVX512-LABEL: load_i16_stride5_vf16:
1898; AVX512:       # %bb.0:
1899; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
1900; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
1901; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm0
1902; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm1
1903; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1904; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
1905; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7]
1906; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
1907; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1908; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
1909; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
1910; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
1911; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm5
1912; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm6
1913; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm4
1914; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
1915; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1916; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
1917; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
1918; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
1919; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1920; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
1921; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
1922; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
1923; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1924; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
1925; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1926; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
1927; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm7
1928; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
1929; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1930; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
1931; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
1932; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
1933; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1934; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
1935; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1936; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1937; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1938; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1939; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
1940; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
1941; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
1942; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
1943; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1944; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
1945; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
1946; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1947; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1948; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm10
1949; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1950; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1951; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1952; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1953; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
1954; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
1955; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1956; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
1957; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1958; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
1959; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
1960; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1961; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1962; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
1963; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1964; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1965; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1966; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1967; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
1968; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
1969; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1970; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
1971; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1972; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
1973; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1974; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1975; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
1976; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1977; AVX512-NEXT:    vmovdqa %ymm5, (%rsi)
1978; AVX512-NEXT:    vmovdqa %ymm7, (%rdx)
1979; AVX512-NEXT:    vmovdqa %ymm8, (%rcx)
1980; AVX512-NEXT:    vmovdqa %ymm9, (%r8)
1981; AVX512-NEXT:    vmovdqa %ymm0, (%r9)
1982; AVX512-NEXT:    vzeroupper
1983; AVX512-NEXT:    retq
1984;
1985; AVX512-FCP-LABEL: load_i16_stride5_vf16:
1986; AVX512-FCP:       # %bb.0:
1987; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
1988; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
1989; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
1990; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
1991; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1992; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3]
1993; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm4
1994; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
1995; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1996; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1997; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
1998; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
1999; AVX512-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm5
2000; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0]
2001; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
2002; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm6, %ymm6
2003; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
2004; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
2005; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
2006; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2007; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
2008; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6]
2009; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm8, %ymm6
2010; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
2011; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
2012; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
2013; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
2014; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
2015; AVX512-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
2016; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0]
2017; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm9
2018; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
2019; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
2020; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
2021; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
2022; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
2023; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
2024; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
2025; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
2026; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2027; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4]
2028; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
2029; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
2030; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2031; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0]
2032; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
2033; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
2034; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
2035; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
2036; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
2037; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
2038; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
2039; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2040; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
2041; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
2042; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
2043; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
2044; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
2045; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2046; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0]
2047; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
2048; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
2049; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
2050; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
2051; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2052; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
2053; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2054; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
2055; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
2056; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
2057; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5]
2058; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2059; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
2060; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
2061; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7]
2062; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
2063; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
2064; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
2065; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
2066; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
2067; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
2068; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
2069; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r8)
2070; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r9)
2071; AVX512-FCP-NEXT:    vzeroupper
2072; AVX512-FCP-NEXT:    retq
2073;
2074; AVX512DQ-LABEL: load_i16_stride5_vf16:
2075; AVX512DQ:       # %bb.0:
2076; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
2077; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
2078; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm0
2079; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm1
2080; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
2081; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
2082; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7]
2083; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
2084; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
2085; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
2086; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
2087; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
2088; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm5
2089; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm6
2090; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm4
2091; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
2092; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
2093; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
2094; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
2095; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
2096; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
2097; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
2098; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
2099; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
2100; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
2101; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
2102; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
2103; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
2104; AVX512DQ-NEXT:    vpor %ymm7, %ymm8, %ymm7
2105; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
2106; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
2107; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2108; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
2109; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
2110; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
2111; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
2112; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
2113; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
2114; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2115; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
2116; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
2117; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
2118; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
2119; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
2120; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
2121; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
2122; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
2123; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
2124; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
2125; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm10
2126; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2127; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
2128; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
2129; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2130; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
2131; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
2132; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2133; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
2134; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
2135; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
2136; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
2137; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
2138; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
2139; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
2140; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
2141; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
2142; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
2143; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2144; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
2145; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
2146; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
2147; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
2148; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
2149; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
2150; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
2151; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2152; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2153; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2154; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rsi)
2155; AVX512DQ-NEXT:    vmovdqa %ymm7, (%rdx)
2156; AVX512DQ-NEXT:    vmovdqa %ymm8, (%rcx)
2157; AVX512DQ-NEXT:    vmovdqa %ymm9, (%r8)
2158; AVX512DQ-NEXT:    vmovdqa %ymm0, (%r9)
2159; AVX512DQ-NEXT:    vzeroupper
2160; AVX512DQ-NEXT:    retq
2161;
2162; AVX512DQ-FCP-LABEL: load_i16_stride5_vf16:
2163; AVX512DQ-FCP:       # %bb.0:
2164; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2165; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
2166; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
2167; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
2168; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
2169; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3]
2170; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm4
2171; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
2172; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
2173; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
2174; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
2175; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
2176; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm5
2177; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0]
2178; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
2179; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm6, %ymm6
2180; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
2181; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
2182; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
2183; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2184; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
2185; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6]
2186; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm8, %ymm6
2187; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
2188; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
2189; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
2190; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
2191; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
2192; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
2193; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0]
2194; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm9
2195; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
2196; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
2197; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
2198; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
2199; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
2200; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
2201; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
2202; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
2203; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2204; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4]
2205; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
2206; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
2207; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2208; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0]
2209; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
2210; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
2211; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
2212; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
2213; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
2214; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
2215; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
2216; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2217; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
2218; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
2219; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
2220; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
2221; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
2222; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2223; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0]
2224; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
2225; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
2226; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
2227; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
2228; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2229; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
2230; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2231; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
2232; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
2233; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
2234; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5]
2235; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2236; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
2237; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
2238; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7]
2239; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
2240; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
2241; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
2242; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
2243; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
2244; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
2245; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
2246; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r8)
2247; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r9)
2248; AVX512DQ-FCP-NEXT:    vzeroupper
2249; AVX512DQ-FCP-NEXT:    retq
2250;
2251; AVX512BW-LABEL: load_i16_stride5_vf16:
2252; AVX512BW:       # %bb.0:
2253; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2254; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
2255; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
2256; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2257; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27]
2258; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm4
2259; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
2260; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
2261; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2262; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28]
2263; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm5
2264; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
2265; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2266; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29]
2267; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm6
2268; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
2269; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2270; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30]
2271; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm7
2272; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
2273; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2274; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31]
2275; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm0
2276; AVX512BW-NEXT:    vmovdqa %ymm3, (%rsi)
2277; AVX512BW-NEXT:    vmovdqa %ymm5, (%rdx)
2278; AVX512BW-NEXT:    vmovdqa %ymm6, (%rcx)
2279; AVX512BW-NEXT:    vmovdqa %ymm7, (%r8)
2280; AVX512BW-NEXT:    vmovdqa %ymm0, (%r9)
2281; AVX512BW-NEXT:    vzeroupper
2282; AVX512BW-NEXT:    retq
2283;
2284; AVX512BW-FCP-LABEL: load_i16_stride5_vf16:
2285; AVX512BW-FCP:       # %bb.0:
2286; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
2287; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
2288; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
2289; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2290; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27]
2291; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
2292; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
2293; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
2294; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2295; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28]
2296; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm5
2297; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
2298; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2299; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29]
2300; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm6
2301; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
2302; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2303; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30]
2304; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm7
2305; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
2306; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2307; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31]
2308; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm0
2309; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
2310; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
2311; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
2312; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
2313; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
2314; AVX512BW-FCP-NEXT:    vzeroupper
2315; AVX512BW-FCP-NEXT:    retq
2316;
2317; AVX512DQ-BW-LABEL: load_i16_stride5_vf16:
2318; AVX512DQ-BW:       # %bb.0:
2319; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2320; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
2321; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
2322; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2323; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27]
2324; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm4
2325; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
2326; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
2327; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2328; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28]
2329; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm5
2330; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
2331; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2332; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29]
2333; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm6
2334; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
2335; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2336; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30]
2337; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm7
2338; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
2339; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2340; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31]
2341; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm0
2342; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%rsi)
2343; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%rdx)
2344; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%rcx)
2345; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%r8)
2346; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%r9)
2347; AVX512DQ-BW-NEXT:    vzeroupper
2348; AVX512DQ-BW-NEXT:    retq
2349;
2350; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf16:
2351; AVX512DQ-BW-FCP:       # %bb.0:
2352; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
2353; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
2354; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
2355; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2356; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27]
2357; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
2358; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
2359; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
2360; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2361; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28]
2362; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm5
2363; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
2364; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2365; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29]
2366; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm6
2367; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
2368; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
2369; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30]
2370; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm7
2371; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
2372; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
2373; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31]
2374; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm0
2375; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
2376; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
2377; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
2378; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
2379; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
2380; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2381; AVX512DQ-BW-FCP-NEXT:    retq
2382  %wide.vec = load <80 x i16>, ptr %in.vec, align 64
2383  %strided.vec0 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
2384  %strided.vec1 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
2385  %strided.vec2 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
2386  %strided.vec3 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
2387  %strided.vec4 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>
2388  store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
2389  store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
2390  store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
2391  store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
2392  store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
2393  ret void
2394}
2395
2396define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
2397; SSE-LABEL: load_i16_stride5_vf32:
2398; SSE:       # %bb.0:
2399; SSE-NEXT:    subq $408, %rsp # imm = 0x198
2400; SSE-NEXT:    movdqa 64(%rdi), %xmm4
2401; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2402; SSE-NEXT:    movdqa (%rdi), %xmm6
2403; SSE-NEXT:    movdqa 16(%rdi), %xmm13
2404; SSE-NEXT:    movdqa 32(%rdi), %xmm9
2405; SSE-NEXT:    movdqa 48(%rdi), %xmm5
2406; SSE-NEXT:    movdqa 224(%rdi), %xmm7
2407; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2408; SSE-NEXT:    movdqa 160(%rdi), %xmm11
2409; SSE-NEXT:    movdqa 176(%rdi), %xmm12
2410; SSE-NEXT:    movdqa 208(%rdi), %xmm8
2411; SSE-NEXT:    movdqa 192(%rdi), %xmm2
2412; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2413; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
2414; SSE-NEXT:    movdqa %xmm0, %xmm1
2415; SSE-NEXT:    pandn %xmm2, %xmm1
2416; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3]
2417; SSE-NEXT:    pand %xmm0, %xmm2
2418; SSE-NEXT:    por %xmm1, %xmm2
2419; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
2420; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2421; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2422; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
2423; SSE-NEXT:    movdqa %xmm11, (%rsp) # 16-byte Spill
2424; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2425; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2426; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
2427; SSE-NEXT:    movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0]
2428; SSE-NEXT:    andps %xmm15, %xmm3
2429; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
2430; SSE-NEXT:    movaps %xmm15, %xmm2
2431; SSE-NEXT:    pandn %xmm1, %xmm2
2432; SSE-NEXT:    por %xmm3, %xmm2
2433; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2434; SSE-NEXT:    movdqa %xmm0, %xmm1
2435; SSE-NEXT:    pandn %xmm9, %xmm1
2436; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2437; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
2438; SSE-NEXT:    movdqa %xmm5, %xmm7
2439; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2440; SSE-NEXT:    pand %xmm0, %xmm2
2441; SSE-NEXT:    por %xmm1, %xmm2
2442; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
2443; SSE-NEXT:    movdqa %xmm13, %xmm5
2444; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2445; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2446; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
2447; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2448; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2449; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2450; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
2451; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
2452; SSE-NEXT:    movaps %xmm15, %xmm2
2453; SSE-NEXT:    andnps %xmm1, %xmm2
2454; SSE-NEXT:    movdqa 272(%rdi), %xmm4
2455; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2456; SSE-NEXT:    andps %xmm15, %xmm3
2457; SSE-NEXT:    orps %xmm3, %xmm2
2458; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2459; SSE-NEXT:    movdqa %xmm0, %xmm1
2460; SSE-NEXT:    pandn %xmm4, %xmm1
2461; SSE-NEXT:    movdqa 288(%rdi), %xmm2
2462; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2463; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
2464; SSE-NEXT:    pand %xmm0, %xmm2
2465; SSE-NEXT:    por %xmm1, %xmm2
2466; SSE-NEXT:    movdqa 256(%rdi), %xmm14
2467; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[3,1,2,3]
2468; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2469; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2470; SSE-NEXT:    movdqa 240(%rdi), %xmm13
2471; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
2472; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2473; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2474; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2475; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
2476; SSE-NEXT:    movdqa 304(%rdi), %xmm1
2477; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2478; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
2479; SSE-NEXT:    movaps %xmm15, %xmm2
2480; SSE-NEXT:    andnps %xmm1, %xmm2
2481; SSE-NEXT:    andps %xmm15, %xmm3
2482; SSE-NEXT:    orps %xmm3, %xmm2
2483; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2484; SSE-NEXT:    movdqa 128(%rdi), %xmm1
2485; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2486; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2487; SSE-NEXT:    pand %xmm0, %xmm1
2488; SSE-NEXT:    movdqa 112(%rdi), %xmm2
2489; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2490; SSE-NEXT:    pandn %xmm2, %xmm0
2491; SSE-NEXT:    por %xmm1, %xmm0
2492; SSE-NEXT:    movdqa 96(%rdi), %xmm1
2493; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2494; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
2495; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2496; SSE-NEXT:    movdqa 80(%rdi), %xmm4
2497; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
2498; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2499; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
2500; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2501; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
2502; SSE-NEXT:    movdqa 144(%rdi), %xmm0
2503; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2504; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2505; SSE-NEXT:    movaps %xmm15, %xmm1
2506; SSE-NEXT:    andnps %xmm0, %xmm1
2507; SSE-NEXT:    andps %xmm15, %xmm2
2508; SSE-NEXT:    orps %xmm2, %xmm1
2509; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2510; SSE-NEXT:    psrlq $48, %xmm12
2511; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3]
2512; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
2513; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
2514; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
2515; SSE-NEXT:    movdqa %xmm0, %xmm2
2516; SSE-NEXT:    pandn %xmm1, %xmm2
2517; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2518; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3]
2519; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2520; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
2521; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2522; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
2523; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2524; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
2525; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
2526; SSE-NEXT:    pand %xmm0, %xmm1
2527; SSE-NEXT:    por %xmm2, %xmm1
2528; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2529; SSE-NEXT:    movdqa %xmm10, %xmm2
2530; SSE-NEXT:    psllq $48, %xmm2
2531; SSE-NEXT:    movaps %xmm15, %xmm3
2532; SSE-NEXT:    andnps %xmm2, %xmm3
2533; SSE-NEXT:    pand %xmm15, %xmm1
2534; SSE-NEXT:    orps %xmm1, %xmm3
2535; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2536; SSE-NEXT:    psrlq $48, %xmm5
2537; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3]
2538; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
2539; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
2540; SSE-NEXT:    movdqa %xmm0, %xmm1
2541; SSE-NEXT:    pandn %xmm2, %xmm1
2542; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3]
2543; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
2544; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2545; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
2546; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2547; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
2548; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
2549; SSE-NEXT:    pand %xmm0, %xmm2
2550; SSE-NEXT:    por %xmm1, %xmm2
2551; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2552; SSE-NEXT:    movdqa %xmm6, %xmm1
2553; SSE-NEXT:    psllq $48, %xmm1
2554; SSE-NEXT:    movdqa %xmm15, %xmm3
2555; SSE-NEXT:    pandn %xmm1, %xmm3
2556; SSE-NEXT:    pand %xmm15, %xmm2
2557; SSE-NEXT:    por %xmm2, %xmm3
2558; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2559; SSE-NEXT:    movdqa %xmm14, %xmm1
2560; SSE-NEXT:    psrlq $48, %xmm1
2561; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3]
2562; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
2563; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2564; SSE-NEXT:    movdqa %xmm0, %xmm1
2565; SSE-NEXT:    pandn %xmm2, %xmm1
2566; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2567; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3]
2568; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2569; SSE-NEXT:    # xmm3 = mem[0,2,2,3]
2570; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2571; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
2572; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2573; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
2574; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
2575; SSE-NEXT:    pand %xmm0, %xmm2
2576; SSE-NEXT:    por %xmm1, %xmm2
2577; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2578; SSE-NEXT:    movdqa %xmm7, %xmm1
2579; SSE-NEXT:    psllq $48, %xmm1
2580; SSE-NEXT:    movdqa %xmm15, %xmm3
2581; SSE-NEXT:    pandn %xmm1, %xmm3
2582; SSE-NEXT:    pand %xmm15, %xmm2
2583; SSE-NEXT:    por %xmm2, %xmm3
2584; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2585; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2586; SSE-NEXT:    movdqa %xmm13, %xmm1
2587; SSE-NEXT:    psrlq $48, %xmm1
2588; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
2589; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
2590; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2591; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2592; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,3,2,3]
2593; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2594; SSE-NEXT:    # xmm3 = mem[0,2,2,3]
2595; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2596; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
2597; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2598; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
2599; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
2600; SSE-NEXT:    pand %xmm0, %xmm1
2601; SSE-NEXT:    pandn %xmm2, %xmm0
2602; SSE-NEXT:    por %xmm1, %xmm0
2603; SSE-NEXT:    pand %xmm15, %xmm0
2604; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2605; SSE-NEXT:    movdqa %xmm5, %xmm1
2606; SSE-NEXT:    psllq $48, %xmm1
2607; SSE-NEXT:    pandn %xmm1, %xmm15
2608; SSE-NEXT:    por %xmm0, %xmm15
2609; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2610; SSE-NEXT:    movdqa %xmm8, %xmm0
2611; SSE-NEXT:    movdqa %xmm11, %xmm8
2612; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
2613; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3]
2614; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
2615; SSE-NEXT:    movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535]
2616; SSE-NEXT:    movaps %xmm11, %xmm1
2617; SSE-NEXT:    andnps %xmm0, %xmm1
2618; SSE-NEXT:    movdqa (%rsp), %xmm4 # 16-byte Reload
2619; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
2620; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2621; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2622; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2623; SSE-NEXT:    pand %xmm11, %xmm2
2624; SSE-NEXT:    por %xmm1, %xmm2
2625; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2626; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
2627; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2628; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2629; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2630; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2631; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2632; SSE-NEXT:    movaps %xmm1, %xmm0
2633; SSE-NEXT:    movaps %xmm1, %xmm15
2634; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2635; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
2636; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3]
2637; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
2638; SSE-NEXT:    movaps %xmm11, %xmm1
2639; SSE-NEXT:    andnps %xmm0, %xmm1
2640; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2641; SSE-NEXT:    # xmm2 = mem[0,1,1,3]
2642; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2643; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2644; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
2645; SSE-NEXT:    pand %xmm11, %xmm2
2646; SSE-NEXT:    por %xmm1, %xmm2
2647; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2648; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0]
2649; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2650; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2651; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2652; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2653; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2654; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0]
2655; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3]
2656; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,0,1,3]
2657; SSE-NEXT:    movaps %xmm11, %xmm1
2658; SSE-NEXT:    andnps %xmm14, %xmm1
2659; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2660; SSE-NEXT:    # xmm2 = mem[0,1,1,3]
2661; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2662; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2663; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
2664; SSE-NEXT:    pand %xmm11, %xmm2
2665; SSE-NEXT:    por %xmm1, %xmm2
2666; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7]
2667; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0]
2668; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2669; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2670; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2671; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2672; SSE-NEXT:    movdqa %xmm12, %xmm0
2673; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2674; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2675; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2676; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
2677; SSE-NEXT:    movaps %xmm11, %xmm1
2678; SSE-NEXT:    andnps %xmm0, %xmm1
2679; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2680; SSE-NEXT:    # xmm2 = mem[0,1,1,3]
2681; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2682; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
2683; SSE-NEXT:    pand %xmm11, %xmm2
2684; SSE-NEXT:    por %xmm1, %xmm2
2685; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2686; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0]
2687; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2688; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2689; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2690; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2691; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7]
2692; SSE-NEXT:    movdqa %xmm11, %xmm1
2693; SSE-NEXT:    pandn %xmm0, %xmm1
2694; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2695; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
2696; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2697; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2698; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2699; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7]
2700; SSE-NEXT:    pand %xmm11, %xmm0
2701; SSE-NEXT:    por %xmm1, %xmm0
2702; SSE-NEXT:    movdqa %xmm0, %xmm2
2703; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2704; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[3,0]
2705; SSE-NEXT:    movaps %xmm11, %xmm0
2706; SSE-NEXT:    andnps %xmm8, %xmm0
2707; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2708; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,2]
2709; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7]
2710; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2711; SSE-NEXT:    # xmm1 = mem[0,1,0,3]
2712; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2713; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2714; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2715; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2716; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7]
2717; SSE-NEXT:    movdqa %xmm11, %xmm1
2718; SSE-NEXT:    pandn %xmm0, %xmm1
2719; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2720; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2721; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2722; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3]
2723; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2724; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2725; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2726; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7]
2727; SSE-NEXT:    pand %xmm11, %xmm13
2728; SSE-NEXT:    por %xmm1, %xmm13
2729; SSE-NEXT:    movaps %xmm15, %xmm0
2730; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0]
2731; SSE-NEXT:    movdqa %xmm11, %xmm12
2732; SSE-NEXT:    pandn %xmm9, %xmm12
2733; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2]
2734; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7]
2735; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2736; SSE-NEXT:    # xmm1 = mem[0,1,0,3]
2737; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2738; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2739; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0]
2740; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7]
2741; SSE-NEXT:    movdqa %xmm11, %xmm1
2742; SSE-NEXT:    pandn %xmm0, %xmm1
2743; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2744; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2745; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2746; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
2747; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2748; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2749; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2750; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7]
2751; SSE-NEXT:    pand %xmm11, %xmm9
2752; SSE-NEXT:    por %xmm1, %xmm9
2753; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2754; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0]
2755; SSE-NEXT:    movdqa %xmm11, %xmm15
2756; SSE-NEXT:    pandn %xmm10, %xmm15
2757; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2]
2758; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7]
2759; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3]
2760; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2761; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2762; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0]
2763; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2764; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7]
2765; SSE-NEXT:    movdqa %xmm11, %xmm1
2766; SSE-NEXT:    pandn %xmm0, %xmm1
2767; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2768; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2769; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2770; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
2771; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2772; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2773; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2774; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm0[1,0,3,3,4,5,6,7]
2775; SSE-NEXT:    pand %xmm11, %xmm10
2776; SSE-NEXT:    por %xmm1, %xmm10
2777; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2778; SSE-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
2779; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
2780; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2781; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[3,0]
2782; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,2]
2783; SSE-NEXT:    movaps %xmm14, %xmm2
2784; SSE-NEXT:    movdqa %xmm3, %xmm1
2785; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[3,0]
2786; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2]
2787; SSE-NEXT:    movaps %xmm1, %xmm14
2788; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
2789; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
2790; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2791; SSE-NEXT:    movaps %xmm8, %xmm1
2792; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0]
2793; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2794; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2795; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7]
2796; SSE-NEXT:    pand %xmm11, %xmm3
2797; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7]
2798; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2799; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7]
2800; SSE-NEXT:    pand %xmm11, %xmm2
2801; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7]
2802; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2803; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
2804; SSE-NEXT:    pand %xmm11, %xmm0
2805; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
2806; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2807; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7]
2808; SSE-NEXT:    pand %xmm11, %xmm4
2809; SSE-NEXT:    pandn %xmm7, %xmm11
2810; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2]
2811; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7]
2812; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2813; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[0,1,0,3]
2814; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6]
2815; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[2,3]
2816; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0]
2817; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2818; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2819; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
2820; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2821; SSE-NEXT:    # xmm14 = mem[0,1,1,3]
2822; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2823; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
2824; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[1,3]
2825; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0]
2826; SSE-NEXT:    por %xmm12, %xmm2
2827; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2828; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
2829; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2830; SSE-NEXT:    # xmm12 = mem[0,1,1,3]
2831; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2832; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
2833; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3]
2834; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0]
2835; SSE-NEXT:    por %xmm15, %xmm0
2836; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2837; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
2838; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2839; SSE-NEXT:    # xmm5 = mem[0,1,1,3]
2840; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2841; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2842; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3]
2843; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0]
2844; SSE-NEXT:    por %xmm4, %xmm11
2845; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
2846; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,1,1,3]
2847; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2848; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
2849; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3]
2850; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0]
2851; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2852; SSE-NEXT:    movaps %xmm1, 16(%rsi)
2853; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2854; SSE-NEXT:    movaps %xmm1, 48(%rsi)
2855; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2856; SSE-NEXT:    movaps %xmm1, (%rsi)
2857; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2858; SSE-NEXT:    movaps %xmm1, 32(%rsi)
2859; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2860; SSE-NEXT:    movaps %xmm1, 16(%rdx)
2861; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2862; SSE-NEXT:    movaps %xmm1, 48(%rdx)
2863; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2864; SSE-NEXT:    movaps %xmm1, (%rdx)
2865; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2866; SSE-NEXT:    movaps %xmm1, 32(%rdx)
2867; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2868; SSE-NEXT:    movaps %xmm1, 16(%rcx)
2869; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2870; SSE-NEXT:    movaps %xmm1, 48(%rcx)
2871; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2872; SSE-NEXT:    movaps %xmm1, (%rcx)
2873; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2874; SSE-NEXT:    movaps %xmm1, 32(%rcx)
2875; SSE-NEXT:    movaps %xmm10, 16(%r8)
2876; SSE-NEXT:    movaps %xmm9, 48(%r8)
2877; SSE-NEXT:    movaps %xmm13, (%r8)
2878; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2879; SSE-NEXT:    movaps %xmm1, 32(%r8)
2880; SSE-NEXT:    movaps %xmm11, 16(%r9)
2881; SSE-NEXT:    movaps %xmm0, 48(%r9)
2882; SSE-NEXT:    movaps %xmm2, (%r9)
2883; SSE-NEXT:    movaps %xmm3, 32(%r9)
2884; SSE-NEXT:    addq $408, %rsp # imm = 0x198
2885; SSE-NEXT:    retq
2886;
2887; AVX-LABEL: load_i16_stride5_vf32:
2888; AVX:       # %bb.0:
2889; AVX-NEXT:    subq $424, %rsp # imm = 0x1A8
2890; AVX-NEXT:    vmovdqa 144(%rdi), %xmm9
2891; AVX-NEXT:    vmovdqa 128(%rdi), %xmm7
2892; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
2893; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2894; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
2895; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2896; AVX-NEXT:    vmovdqa 96(%rdi), %xmm11
2897; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[0,1,1,3]
2898; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2899; AVX-NEXT:    vmovdqa 112(%rdi), %xmm10
2900; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1]
2901; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2902; AVX-NEXT:    vmovdqa 80(%rdi), %xmm3
2903; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2904; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2905; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2906; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
2907; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
2908; AVX-NEXT:    vmovdqa (%rdi), %xmm5
2909; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2910; AVX-NEXT:    vmovdqa 16(%rdi), %xmm12
2911; AVX-NEXT:    vmovdqa 32(%rdi), %xmm3
2912; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2913; AVX-NEXT:    vmovdqa 48(%rdi), %xmm15
2914; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
2915; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
2916; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
2917; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2918; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
2919; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2920; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
2921; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2922; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
2923; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
2924; AVX-NEXT:    vandps %ymm6, %ymm3, %ymm3
2925; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
2926; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1]
2927; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2928; AVX-NEXT:    vandnps %ymm4, %ymm6, %ymm4
2929; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
2930; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
2931; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2932; AVX-NEXT:    vmovdqa 304(%rdi), %xmm2
2933; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2934; AVX-NEXT:    vmovdqa 288(%rdi), %xmm13
2935; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7]
2936; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2937; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
2938; AVX-NEXT:    vmovdqa 256(%rdi), %xmm0
2939; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2940; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2941; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2942; AVX-NEXT:    vmovdqa 272(%rdi), %xmm0
2943; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
2944; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2945; AVX-NEXT:    vmovdqa 240(%rdi), %xmm0
2946; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2947; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
2948; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2949; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
2950; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm1[5,6,7]
2951; AVX-NEXT:    vmovdqa 176(%rdi), %xmm0
2952; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2953; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
2954; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2955; AVX-NEXT:    vmovdqa 160(%rdi), %xmm0
2956; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2957; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
2958; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
2959; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2960; AVX-NEXT:    vmovdqa 208(%rdi), %xmm0
2961; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2962; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2963; AVX-NEXT:    vmovdqa 192(%rdi), %xmm14
2964; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
2965; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2966; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2967; AVX-NEXT:    vandps %ymm6, %ymm0, %ymm0
2968; AVX-NEXT:    vmovaps 224(%rdi), %xmm1
2969; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2970; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
2971; AVX-NEXT:    vandnps %ymm8, %ymm6, %ymm8
2972; AVX-NEXT:    vorps %ymm0, %ymm8, %ymm0
2973; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2974; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2975; AVX-NEXT:    vmovdqa %xmm11, %xmm6
2976; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2977; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
2978; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
2979; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2980; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2981; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
2982; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
2983; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
2984; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2985; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7]
2986; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
2987; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
2988; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
2989; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2990; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3]
2991; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
2992; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm9
2993; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
2994; AVX-NEXT:    vmovdqa %xmm15, %xmm12
2995; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2996; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2997; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7]
2998; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
2999; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
3000; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7]
3001; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
3002; AVX-NEXT:    vandps %ymm3, %ymm10, %ymm3
3003; AVX-NEXT:    vpsllq $48, %xmm5, %xmm9
3004; AVX-NEXT:    vandnps %ymm9, %ymm10, %ymm9
3005; AVX-NEXT:    vorps %ymm3, %ymm9, %ymm3
3006; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
3007; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3008; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3009; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7]
3010; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
3011; AVX-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
3012; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
3013; AVX-NEXT:    # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
3014; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7]
3015; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3016; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3017; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3]
3018; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
3019; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
3020; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
3021; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3022; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7]
3023; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
3024; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3025; AVX-NEXT:    # xmm3 = mem[0,3,2,3]
3026; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
3027; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3028; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm8
3029; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
3030; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7]
3031; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
3032; AVX-NEXT:    vandps %ymm1, %ymm8, %ymm1
3033; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3034; AVX-NEXT:    vpsllq $48, %xmm14, %xmm3
3035; AVX-NEXT:    vandnps %ymm3, %ymm8, %ymm3
3036; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
3037; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3038; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3039; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
3040; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
3041; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
3042; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
3043; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
3044; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
3045; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
3046; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
3047; AVX-NEXT:    # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7]
3048; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
3049; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
3050; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
3051; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
3052; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
3053; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
3054; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3]
3055; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
3056; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3057; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
3058; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7]
3059; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3060; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0]
3061; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5]
3062; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
3063; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
3064; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3065; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3066; AVX-NEXT:    vmovdqa %xmm5, %xmm9
3067; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
3068; AVX-NEXT:    vpshufb %xmm7, %xmm0, %xmm4
3069; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3070; AVX-NEXT:    vmovdqa %xmm15, %xmm0
3071; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7]
3072; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
3073; AVX-NEXT:    vmovdqa %xmm10, %xmm2
3074; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3]
3075; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
3076; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
3077; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7]
3078; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3079; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
3080; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
3081; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3082; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
3083; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
3084; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3085; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
3086; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7]
3087; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0]
3088; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
3089; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
3090; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm1
3091; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3092; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7]
3093; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
3094; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
3095; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm3
3096; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
3097; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
3098; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
3099; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
3100; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
3101; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7]
3102; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
3103; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
3104; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7]
3105; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
3106; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
3107; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7]
3108; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3]
3109; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
3110; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
3111; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
3112; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3113; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3114; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3115; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
3116; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
3117; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3118; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3119; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
3120; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
3121; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3122; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm3
3123; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
3124; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7]
3125; AVX-NEXT:    vmovdqa %xmm11, %xmm0
3126; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3127; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
3128; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
3129; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3130; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3131; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7]
3132; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
3133; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7]
3134; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7]
3135; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3]
3136; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
3137; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
3138; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm7
3139; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]
3140; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
3141; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
3142; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
3143; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3144; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3]
3145; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
3146; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
3147; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
3148; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
3149; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7]
3150; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
3151; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
3152; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7]
3153; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
3154; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
3155; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
3156; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7]
3157; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
3158; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
3159; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
3160; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
3161; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3162; AVX-NEXT:    # xmm3 = mem[3,1,2,3]
3163; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
3164; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3165; AVX-NEXT:    # xmm5 = mem[0,2,2,3]
3166; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
3167; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
3168; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3169; AVX-NEXT:    # xmm5 = mem[0,3,2,3]
3170; AVX-NEXT:    vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
3171; AVX-NEXT:    # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
3172; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3173; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
3174; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
3175; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
3176; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3177; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
3178; AVX-NEXT:    # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
3179; AVX-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3180; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
3181; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
3182; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
3183; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
3184; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
3185; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
3186; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
3187; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
3188; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
3189; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3190; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
3191; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3192; AVX-NEXT:    vmovaps %ymm3, (%rsi)
3193; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3194; AVX-NEXT:    vmovaps %ymm3, 32(%rdx)
3195; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3196; AVX-NEXT:    vmovaps %ymm3, (%rdx)
3197; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3198; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
3199; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3200; AVX-NEXT:    vmovaps %ymm0, (%rcx)
3201; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3202; AVX-NEXT:    vmovaps %ymm0, 32(%r8)
3203; AVX-NEXT:    vmovaps %ymm7, (%r8)
3204; AVX-NEXT:    vmovaps %ymm2, 32(%r9)
3205; AVX-NEXT:    vmovaps %ymm1, (%r9)
3206; AVX-NEXT:    addq $424, %rsp # imm = 0x1A8
3207; AVX-NEXT:    vzeroupper
3208; AVX-NEXT:    retq
3209;
3210; AVX2-LABEL: load_i16_stride5_vf32:
3211; AVX2:       # %bb.0:
3212; AVX2-NEXT:    subq $264, %rsp # imm = 0x108
3213; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
3214; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
3215; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm3
3216; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm15
3217; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm4
3218; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm5
3219; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
3220; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm6
3221; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
3222; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
3223; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
3224; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
3225; AVX2-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
3226; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
3227; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3228; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
3229; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
3230; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
3231; AVX2-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
3232; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
3233; AVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm0, %ymm8
3234; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4],ymm3[5],ymm15[6,7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12],ymm3[13],ymm15[14,15]
3235; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
3236; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7]
3237; AVX2-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
3238; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
3239; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
3240; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm12
3241; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7]
3242; AVX2-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
3243; AVX2-NEXT:    vpblendvb %ymm9, %ymm10, %ymm0, %ymm12
3244; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
3245; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
3246; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7]
3247; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
3248; AVX2-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
3249; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
3250; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
3251; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
3252; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
3253; AVX2-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
3254; AVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm0, %ymm0
3255; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
3256; AVX2-NEXT:    vmovdqa %ymm15, %ymm5
3257; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
3258; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
3259; AVX2-NEXT:    vpshufb %ymm10, %ymm11, %ymm10
3260; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
3261; AVX2-NEXT:    vmovdqa %ymm2, %ymm15
3262; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
3263; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
3264; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm13
3265; AVX2-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
3266; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm14
3267; AVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm10, %ymm9
3268; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3]
3269; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
3270; AVX2-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
3271; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
3272; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
3273; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7]
3274; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3275; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm11
3276; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm10
3277; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3]
3278; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
3279; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3280; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
3281; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
3282; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3283; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3]
3284; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
3285; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
3286; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3287; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
3288; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3289; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3290; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3]
3291; AVX2-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
3292; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
3293; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
3294; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
3295; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3296; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
3297; AVX2-NEXT:    vmovdqa %ymm6, %ymm9
3298; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3299; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3300; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3301; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
3302; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3303; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
3304; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
3305; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
3306; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
3307; AVX2-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
3308; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3309; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3310; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3]
3311; AVX2-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
3312; AVX2-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
3313; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3314; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
3315; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3316; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3317; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
3318; AVX2-NEXT:    vmovdqa %ymm5, %ymm6
3319; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3320; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3321; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
3322; AVX2-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
3323; AVX2-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
3324; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15]
3325; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
3326; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
3327; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3328; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3329; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3]
3330; AVX2-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
3331; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3332; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
3333; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3334; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3335; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
3336; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3337; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
3338; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
3339; AVX2-NEXT:    vmovdqa %ymm4, %ymm7
3340; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
3341; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3]
3342; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
3343; AVX2-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
3344; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
3345; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
3346; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3347; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3]
3348; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
3349; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
3350; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3351; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
3352; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3353; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3354; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
3355; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
3356; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
3357; AVX2-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
3358; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15]
3359; AVX2-NEXT:    vmovdqa %ymm5, %ymm1
3360; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm12
3361; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
3362; AVX2-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
3363; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7]
3364; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3]
3365; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
3366; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3367; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
3368; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3369; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3370; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
3371; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
3372; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
3373; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
3374; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
3375; AVX2-NEXT:    # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
3376; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
3377; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
3378; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
3379; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
3380; AVX2-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
3381; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
3382; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
3383; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
3384; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3]
3385; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
3386; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
3387; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
3388; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
3389; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3390; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
3391; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3392; AVX2-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
3393; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
3394; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
3395; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
3396; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
3397; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
3398; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
3399; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
3400; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
3401; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
3402; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
3403; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
3404; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
3405; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3406; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3407; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
3408; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3409; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
3410; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3411; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
3412; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3413; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
3414; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3415; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
3416; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3417; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
3418; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3419; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
3420; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3421; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
3422; AVX2-NEXT:    vmovdqa %ymm0, (%r8)
3423; AVX2-NEXT:    vmovdqa %ymm2, 32(%r9)
3424; AVX2-NEXT:    vmovdqa %ymm3, (%r9)
3425; AVX2-NEXT:    addq $264, %rsp # imm = 0x108
3426; AVX2-NEXT:    vzeroupper
3427; AVX2-NEXT:    retq
3428;
3429; AVX2-FP-LABEL: load_i16_stride5_vf32:
3430; AVX2-FP:       # %bb.0:
3431; AVX2-FP-NEXT:    subq $264, %rsp # imm = 0x108
3432; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm13
3433; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
3434; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm7
3435; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm10
3436; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm14
3437; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm3
3438; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm4
3439; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm5
3440; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
3441; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3442; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3443; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
3444; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
3445; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
3446; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15]
3447; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3448; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
3449; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
3450; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
3451; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
3452; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
3453; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm8, %ymm0, %ymm0
3454; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15]
3455; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
3456; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7]
3457; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
3458; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3459; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
3460; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm12
3461; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7]
3462; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
3463; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm8, %ymm1, %ymm12
3464; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
3465; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
3466; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
3467; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
3468; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
3469; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15]
3470; AVX2-FP-NEXT:    vmovdqa %ymm14, %ymm5
3471; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm14
3472; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7]
3473; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
3474; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
3475; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm9, %ymm1, %ymm1
3476; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15]
3477; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3478; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1]
3479; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
3480; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm9, %ymm15
3481; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
3482; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
3483; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
3484; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm8
3485; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm9, %xmm14
3486; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm9
3487; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
3488; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3]
3489; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
3490; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
3491; AVX2-FP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
3492; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15]
3493; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
3494; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3495; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm6
3496; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm15
3497; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
3498; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
3499; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3500; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15]
3501; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7]
3502; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3503; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3]
3504; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
3505; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
3506; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3507; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3508; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3509; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3510; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3]
3511; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
3512; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3513; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
3514; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
3515; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3516; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3517; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3518; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
3519; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3520; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
3521; AVX2-FP-NEXT:    vmovdqu %ymm5, (%rsp) # 32-byte Spill
3522; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3523; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15]
3524; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm11
3525; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
3526; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
3527; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
3528; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3529; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
3530; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
3531; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3]
3532; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
3533; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
3534; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3535; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3536; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3537; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3538; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15]
3539; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm10
3540; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3541; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
3542; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
3543; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3544; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15]
3545; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm11
3546; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
3547; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
3548; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
3549; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3]
3550; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
3551; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3552; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3553; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3554; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3555; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
3556; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3557; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
3558; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
3559; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm11
3560; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3]
3561; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
3562; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
3563; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
3564; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
3565; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
3566; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3]
3567; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
3568; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3569; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3570; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3571; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3572; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm5
3573; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3574; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
3575; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1]
3576; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7]
3577; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
3578; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm4
3579; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15]
3580; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm14
3581; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
3582; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
3583; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7]
3584; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3]
3585; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
3586; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3587; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
3588; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3589; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
3590; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm6, %xmm3
3591; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
3592; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm15, %xmm12
3593; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
3594; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
3595; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3596; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
3597; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15]
3598; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
3599; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
3600; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
3601; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
3602; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
3603; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
3604; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
3605; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
3606; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3607; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
3608; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3609; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3610; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
3611; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3612; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7]
3613; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
3614; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
3615; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
3616; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
3617; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3618; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
3619; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
3620; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
3621; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm8, %xmm5
3622; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
3623; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
3624; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
3625; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
3626; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3627; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
3628; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3629; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
3630; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3631; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
3632; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3633; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
3634; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3635; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
3636; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3637; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
3638; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%r8)
3639; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r8)
3640; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%r9)
3641; AVX2-FP-NEXT:    vmovdqa %ymm3, (%r9)
3642; AVX2-FP-NEXT:    addq $264, %rsp # imm = 0x108
3643; AVX2-FP-NEXT:    vzeroupper
3644; AVX2-FP-NEXT:    retq
3645;
3646; AVX2-FCP-LABEL: load_i16_stride5_vf32:
3647; AVX2-FCP:       # %bb.0:
3648; AVX2-FCP-NEXT:    subq $296, %rsp # imm = 0x128
3649; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm15
3650; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
3651; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
3652; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm14
3653; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
3654; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
3655; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
3656; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
3657; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15]
3658; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3659; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3660; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
3661; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
3662; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
3663; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm8
3664; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
3665; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
3666; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7]
3667; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
3668; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm13
3669; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
3670; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm13, %ymm8, %ymm8
3671; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15]
3672; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm0
3673; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3674; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
3675; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7]
3676; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm13, %xmm12
3677; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15]
3678; AVX2-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
3679; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm2
3680; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm10, %ymm10
3681; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
3682; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm12, %ymm10, %ymm11
3683; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
3684; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,4,7,1,6]
3685; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm10
3686; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
3687; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm10
3688; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
3689; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm7
3690; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
3691; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7]
3692; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
3693; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
3694; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm13, %ymm10, %ymm13
3695; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
3696; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
3697; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7]
3698; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
3699; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
3700; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm15
3701; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm10
3702; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
3703; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm10
3704; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
3705; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm10, %ymm0
3706; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm14, %ymm9
3707; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
3708; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
3709; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
3710; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3711; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3712; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
3713; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm14, %ymm8
3714; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm8
3715; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
3716; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7]
3717; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3718; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7]
3719; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm8, %ymm11
3720; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
3721; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm11, %ymm11
3722; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
3723; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7]
3724; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3725; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm8
3726; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
3727; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15]
3728; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
3729; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3730; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
3731; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm9
3732; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3733; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3734; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
3735; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7]
3736; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3737; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3738; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
3739; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,2,0,0,5,7,2,4]
3740; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm14, %ymm8
3741; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
3742; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
3743; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3744; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
3745; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
3746; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
3747; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
3748; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm8, %ymm11
3749; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm11
3750; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15]
3751; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
3752; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3753; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm6
3754; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3755; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15]
3756; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm13
3757; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7]
3758; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
3759; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
3760; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
3761; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm14, %ymm11
3762; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm11, %ymm2
3763; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
3764; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm2
3765; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
3766; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3767; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3768; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3769; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
3770; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
3771; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
3772; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
3773; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm7
3774; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
3775; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm11, %ymm2
3776; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
3777; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
3778; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
3779; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
3780; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
3781; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
3782; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
3783; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm2, %ymm8
3784; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
3785; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm8
3786; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
3787; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
3788; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3789; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15]
3790; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm8
3791; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
3792; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
3793; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
3794; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
3795; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm9
3796; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm14
3797; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm11, %ymm1
3798; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
3799; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
3800; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm2, %ymm1
3801; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
3802; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
3803; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3804; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15]
3805; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3806; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
3807; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
3808; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
3809; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
3810; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5]
3811; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3812; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
3813; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
3814; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
3815; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
3816; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
3817; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3818; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
3819; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm1, %ymm6
3820; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
3821; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
3822; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
3823; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
3824; AVX2-FCP-NEXT:    # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
3825; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
3826; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
3827; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
3828; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15]
3829; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm2, %ymm2
3830; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
3831; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
3832; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm1, %ymm1
3833; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
3834; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
3835; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3836; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rsi)
3837; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3838; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rsi)
3839; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3840; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rdx)
3841; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3842; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rdx)
3843; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3844; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rcx)
3845; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3846; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rcx)
3847; AVX2-FCP-NEXT:    vmovdqa %ymm15, 32(%r8)
3848; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3849; AVX2-FCP-NEXT:    vmovaps %ymm2, (%r8)
3850; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%r9)
3851; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
3852; AVX2-FCP-NEXT:    addq $296, %rsp # imm = 0x128
3853; AVX2-FCP-NEXT:    vzeroupper
3854; AVX2-FCP-NEXT:    retq
3855;
3856; AVX512-LABEL: load_i16_stride5_vf32:
3857; AVX512:       # %bb.0:
3858; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm0
3859; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm1
3860; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
3861; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
3862; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
3863; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
3864; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm4
3865; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm3
3866; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm9
3867; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
3868; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
3869; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
3870; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
3871; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
3872; AVX512-NEXT:    vmovdqa64 176(%rdi), %xmm20
3873; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
3874; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
3875; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm6
3876; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
3877; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
3878; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
3879; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
3880; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7]
3881; AVX512-NEXT:    vmovdqa (%rdi), %ymm8
3882; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
3883; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
3884; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm5
3885; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
3886; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3887; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
3888; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
3889; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
3890; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm13
3891; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
3892; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
3893; AVX512-NEXT:    vpor %ymm11, %ymm12, %ymm15
3894; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm11
3895; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm12
3896; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3]
3897; AVX512-NEXT:    vpshufb %xmm7, %xmm13, %xmm7
3898; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3899; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
3900; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15))
3901; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm7, %zmm16
3902; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15]
3903; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm14
3904; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7]
3905; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
3906; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
3907; AVX512-NEXT:    vpsrlq $48, %xmm20, %xmm15
3908; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3]
3909; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
3910; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
3911; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7]
3912; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
3913; AVX512-NEXT:    vmovdqa %ymm0, %ymm2
3914; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
3915; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm15
3916; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
3917; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
3918; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
3919; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7]
3920; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
3921; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
3922; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
3923; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15]
3924; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm0
3925; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7]
3926; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
3927; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u]
3928; AVX512-NEXT:    vpor %ymm0, %ymm13, %ymm0
3929; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
3930; AVX512-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
3931; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
3932; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0))
3933; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm19
3934; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
3935; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
3936; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7]
3937; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
3938; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
3939; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
3940; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3]
3941; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
3942; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm15
3943; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3]
3944; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
3945; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
3946; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
3947; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm13
3948; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7]
3949; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
3950; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3951; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
3952; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15]
3953; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm20
3954; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm13
3955; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7]
3956; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
3957; AVX512-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
3958; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
3959; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm22
3960; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
3961; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7]
3962; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
3963; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
3964; AVX512-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
3965; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
3966; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm0
3967; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15]
3968; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm14
3969; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3]
3970; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
3971; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
3972; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
3973; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
3974; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
3975; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
3976; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
3977; AVX512-NEXT:    vmovdqa %ymm2, %ymm9
3978; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
3979; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
3980; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0))
3981; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
3982; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
3983; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm14
3984; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15]
3985; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
3986; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm17
3987; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
3988; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
3989; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
3990; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
3991; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
3992; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15]
3993; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm14
3994; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7]
3995; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
3996; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
3997; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
3998; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3999; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
4000; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
4001; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
4002; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm2
4003; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
4004; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
4005; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13))
4006; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
4007; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm2
4008; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
4009; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
4010; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
4011; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm3
4012; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3]
4013; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
4014; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
4015; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
4016; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4017; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4018; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
4019; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
4020; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
4021; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
4022; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
4023; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
4024; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
4025; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
4026; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
4027; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
4028; AVX512-NEXT:    movb $7, %al
4029; AVX512-NEXT:    kmovw %eax, %k1
4030; AVX512-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1}
4031; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15]
4032; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
4033; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
4034; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
4035; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
4036; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4037; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4038; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4039; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4040; AVX512-NEXT:    vmovdqa64 %zmm16, (%rsi)
4041; AVX512-NEXT:    vmovdqa64 %zmm19, (%rdx)
4042; AVX512-NEXT:    vmovdqa64 %zmm7, (%rcx)
4043; AVX512-NEXT:    vmovdqa64 %zmm17, (%r8)
4044; AVX512-NEXT:    vmovdqa64 %zmm0, (%r9)
4045; AVX512-NEXT:    vzeroupper
4046; AVX512-NEXT:    retq
4047;
4048; AVX512-FCP-LABEL: load_i16_stride5_vf32:
4049; AVX512-FCP:       # %bb.0:
4050; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm2
4051; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
4052; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
4053; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
4054; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4055; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm4
4056; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
4057; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
4058; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0]
4059; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
4060; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
4061; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0]
4062; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
4063; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
4064; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm1
4065; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
4066; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
4067; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
4068; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
4069; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4070; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
4071; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm10
4072; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
4073; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
4074; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
4075; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
4076; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3]
4077; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm7
4078; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
4079; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
4080; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
4081; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
4082; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
4083; AVX512-FCP-NEXT:    vpor %ymm7, %ymm12, %ymm12
4084; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0]
4085; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
4086; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm13
4087; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
4088; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
4089; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4090; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12))
4091; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm13, %zmm16
4092; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
4093; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
4094; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
4095; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
4096; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm12
4097; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
4098; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
4099; AVX512-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
4100; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
4101; AVX512-FCP-NEXT:    vpor %ymm13, %ymm12, %ymm12
4102; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0]
4103; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm15
4104; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
4105; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm15
4106; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12))
4107; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
4108; AVX512-FCP-NEXT:    vpsrlq $48, %xmm2, %xmm12
4109; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
4110; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
4111; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0]
4112; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
4113; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
4114; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
4115; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
4116; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
4117; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
4118; AVX512-FCP-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
4119; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
4120; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
4121; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7]
4122; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm15, %zmm17
4123; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
4124; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
4125; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7]
4126; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
4127; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4128; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
4129; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
4130; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
4131; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0]
4132; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm12
4133; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
4134; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
4135; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
4136; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
4137; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
4138; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
4139; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
4140; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
4141; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
4142; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4]
4143; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm19, %ymm15
4144; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
4145; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
4146; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0]
4147; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4148; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
4149; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
4150; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12))
4151; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm14
4152; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
4153; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
4154; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
4155; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
4156; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
4157; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
4158; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm12
4159; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
4160; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
4161; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
4162; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
4163; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
4164; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
4165; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm18, %ymm15
4166; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
4167; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
4168; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
4169; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0]
4170; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4171; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
4172; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
4173; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
4174; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6))
4175; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm6
4176; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
4177; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
4178; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7]
4179; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
4180; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4181; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
4182; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
4183; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm12, %zmm6
4184; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
4185; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4186; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
4187; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
4188; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
4189; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5]
4190; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
4191; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
4192; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
4193; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
4194; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
4195; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
4196; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0]
4197; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
4198; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
4199; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
4200; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4201; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
4202; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm3, %ymm3
4203; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
4204; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
4205; AVX512-FCP-NEXT:    movb $7, %al
4206; AVX512-FCP-NEXT:    kmovw %eax, %k1
4207; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
4208; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
4209; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
4210; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
4211; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
4212; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
4213; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4214; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
4215; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4216; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
4217; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rsi)
4218; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
4219; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rcx)
4220; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%r8)
4221; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
4222; AVX512-FCP-NEXT:    vzeroupper
4223; AVX512-FCP-NEXT:    retq
4224;
4225; AVX512DQ-LABEL: load_i16_stride5_vf32:
4226; AVX512DQ:       # %bb.0:
4227; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm0
4228; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm1
4229; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
4230; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
4231; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
4232; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
4233; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm4
4234; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm3
4235; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm9
4236; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
4237; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
4238; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
4239; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
4240; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
4241; AVX512DQ-NEXT:    vmovdqa64 176(%rdi), %xmm20
4242; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
4243; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
4244; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm6
4245; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
4246; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
4247; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
4248; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
4249; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7]
4250; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm8
4251; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm10
4252; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
4253; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm5
4254; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
4255; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
4256; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
4257; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
4258; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
4259; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm13
4260; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
4261; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
4262; AVX512DQ-NEXT:    vpor %ymm11, %ymm12, %ymm15
4263; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm11
4264; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm12
4265; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3]
4266; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm13, %xmm7
4267; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4268; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4269; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15))
4270; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm7, %zmm16
4271; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15]
4272; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm14
4273; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7]
4274; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
4275; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
4276; AVX512DQ-NEXT:    vpsrlq $48, %xmm20, %xmm15
4277; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3]
4278; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
4279; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
4280; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7]
4281; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
4282; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm2
4283; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
4284; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm15
4285; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
4286; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
4287; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4288; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7]
4289; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
4290; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
4291; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
4292; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15]
4293; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm0
4294; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7]
4295; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
4296; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u]
4297; AVX512DQ-NEXT:    vpor %ymm0, %ymm13, %ymm0
4298; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
4299; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
4300; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4301; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0))
4302; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm19
4303; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
4304; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm7
4305; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7]
4306; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
4307; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
4308; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
4309; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3]
4310; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
4311; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm15
4312; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3]
4313; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
4314; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
4315; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
4316; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm13
4317; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7]
4318; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
4319; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4320; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
4321; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15]
4322; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm20
4323; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm13
4324; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7]
4325; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
4326; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
4327; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
4328; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm22
4329; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
4330; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7]
4331; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
4332; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
4333; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
4334; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4335; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm0
4336; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15]
4337; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
4338; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3]
4339; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
4340; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
4341; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
4342; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
4343; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
4344; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
4345; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
4346; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
4347; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
4348; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
4349; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0))
4350; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
4351; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4352; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm14
4353; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15]
4354; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
4355; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm17
4356; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
4357; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
4358; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
4359; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
4360; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
4361; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15]
4362; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
4363; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7]
4364; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
4365; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
4366; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
4367; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
4368; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
4369; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
4370; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
4371; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm2
4372; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
4373; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
4374; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13))
4375; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
4376; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm2
4377; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
4378; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
4379; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
4380; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
4381; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3]
4382; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
4383; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
4384; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
4385; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4386; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4387; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
4388; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
4389; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
4390; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
4391; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
4392; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
4393; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
4394; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
4395; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
4396; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
4397; AVX512DQ-NEXT:    movb $7, %al
4398; AVX512DQ-NEXT:    kmovw %eax, %k1
4399; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1}
4400; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15]
4401; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
4402; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
4403; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
4404; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
4405; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4406; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4407; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4408; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4409; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rsi)
4410; AVX512DQ-NEXT:    vmovdqa64 %zmm19, (%rdx)
4411; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rcx)
4412; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%r8)
4413; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r9)
4414; AVX512DQ-NEXT:    vzeroupper
4415; AVX512DQ-NEXT:    retq
4416;
4417; AVX512DQ-FCP-LABEL: load_i16_stride5_vf32:
4418; AVX512DQ-FCP:       # %bb.0:
4419; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm2
4420; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
4421; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
4422; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
4423; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4424; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm4
4425; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
4426; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
4427; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0]
4428; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
4429; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
4430; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0]
4431; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
4432; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
4433; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm1
4434; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
4435; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
4436; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
4437; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
4438; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4439; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
4440; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm10
4441; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
4442; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
4443; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
4444; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
4445; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3]
4446; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm7
4447; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
4448; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
4449; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
4450; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
4451; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
4452; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm12, %ymm12
4453; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0]
4454; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
4455; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm13
4456; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
4457; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
4458; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4459; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12))
4460; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm13, %zmm16
4461; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
4462; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
4463; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
4464; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
4465; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm12
4466; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
4467; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
4468; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
4469; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
4470; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm12, %ymm12
4471; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0]
4472; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm15
4473; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
4474; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm15
4475; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12))
4476; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
4477; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm2, %xmm12
4478; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
4479; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
4480; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0]
4481; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
4482; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
4483; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
4484; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
4485; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
4486; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
4487; AVX512DQ-FCP-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
4488; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
4489; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
4490; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7]
4491; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm15, %zmm17
4492; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
4493; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
4494; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7]
4495; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
4496; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4497; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
4498; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
4499; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
4500; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0]
4501; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm12
4502; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
4503; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
4504; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
4505; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
4506; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
4507; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
4508; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
4509; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
4510; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
4511; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4]
4512; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm19, %ymm15
4513; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
4514; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
4515; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0]
4516; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4517; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
4518; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
4519; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12))
4520; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm14
4521; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
4522; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
4523; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
4524; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
4525; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
4526; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
4527; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm12
4528; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
4529; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
4530; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
4531; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
4532; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
4533; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
4534; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm18, %ymm15
4535; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
4536; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
4537; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
4538; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0]
4539; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4540; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
4541; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
4542; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
4543; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6))
4544; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm6
4545; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
4546; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
4547; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7]
4548; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
4549; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4550; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
4551; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
4552; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm12, %zmm6
4553; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
4554; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4555; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
4556; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
4557; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
4558; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5]
4559; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
4560; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
4561; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
4562; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
4563; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
4564; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
4565; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0]
4566; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
4567; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
4568; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
4569; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4570; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
4571; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm3, %ymm3
4572; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
4573; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
4574; AVX512DQ-FCP-NEXT:    movb $7, %al
4575; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
4576; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
4577; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
4578; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
4579; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
4580; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
4581; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
4582; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4583; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
4584; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4585; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
4586; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rsi)
4587; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
4588; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%rcx)
4589; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%r8)
4590; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
4591; AVX512DQ-FCP-NEXT:    vzeroupper
4592; AVX512DQ-FCP-NEXT:    retq
4593;
4594; AVX512BW-LABEL: load_i16_stride5_vf32:
4595; AVX512BW:       # %bb.0:
4596; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm0
4597; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
4598; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
4599; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
4600; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
4601; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
4602; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
4603; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm5
4604; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
4605; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm1, %zmm6
4606; AVX512BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
4607; AVX512BW-NEXT:    kmovd %eax, %k1
4608; AVX512BW-NEXT:    vmovdqu16 %zmm5, %zmm6 {%k1}
4609; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
4610; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm6, %zmm5
4611; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
4612; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4613; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
4614; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
4615; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm1, %zmm7
4616; AVX512BW-NEXT:    vmovdqu16 %zmm6, %zmm7 {%k1}
4617; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
4618; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm6
4619; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
4620; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4621; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
4622; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
4623; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm2, %zmm8
4624; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm8 {%k1}
4625; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
4626; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm8, %zmm7
4627; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
4628; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4629; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
4630; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
4631; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm2, %zmm9
4632; AVX512BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
4633; AVX512BW-NEXT:    kmovd %eax, %k1
4634; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
4635; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
4636; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm9, %zmm8
4637; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
4638; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm1, %zmm9
4639; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
4640; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
4641; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
4642; AVX512BW-NEXT:    movb $7, %al
4643; AVX512BW-NEXT:    kmovd %eax, %k1
4644; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
4645; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
4646; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
4647; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rsi)
4648; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rdx)
4649; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rcx)
4650; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r8)
4651; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%r9)
4652; AVX512BW-NEXT:    vzeroupper
4653; AVX512BW-NEXT:    retq
4654;
4655; AVX512BW-FCP-LABEL: load_i16_stride5_vf32:
4656; AVX512BW-FCP:       # %bb.0:
4657; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm0
4658; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
4659; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
4660; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
4661; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
4662; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
4663; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
4664; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm5
4665; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
4666; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm1, %zmm6
4667; AVX512BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
4668; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
4669; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm6 {%k1}
4670; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
4671; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm6, %zmm5
4672; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
4673; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4674; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
4675; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
4676; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm1, %zmm7
4677; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm6, %zmm7 {%k1}
4678; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
4679; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm6
4680; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
4681; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4682; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
4683; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
4684; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm2, %zmm8
4685; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm8 {%k1}
4686; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
4687; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm8, %zmm7
4688; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
4689; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4690; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
4691; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
4692; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm2, %zmm9
4693; AVX512BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
4694; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
4695; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
4696; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
4697; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm9, %zmm8
4698; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
4699; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm1, %zmm9
4700; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
4701; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
4702; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
4703; AVX512BW-FCP-NEXT:    movb $7, %al
4704; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
4705; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
4706; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
4707; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
4708; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rsi)
4709; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rdx)
4710; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
4711; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%r8)
4712; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
4713; AVX512BW-FCP-NEXT:    vzeroupper
4714; AVX512BW-FCP-NEXT:    retq
4715;
4716; AVX512DQ-BW-LABEL: load_i16_stride5_vf32:
4717; AVX512DQ-BW:       # %bb.0:
4718; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm0
4719; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm1
4720; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
4721; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
4722; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
4723; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
4724; AVX512DQ-BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
4725; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm5
4726; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
4727; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm1, %zmm6
4728; AVX512DQ-BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
4729; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
4730; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm5, %zmm6 {%k1}
4731; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
4732; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm6, %zmm5
4733; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
4734; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4735; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
4736; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
4737; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm1, %zmm7
4738; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm6, %zmm7 {%k1}
4739; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
4740; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm6
4741; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
4742; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4743; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
4744; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
4745; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm2, %zmm8
4746; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm8 {%k1}
4747; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
4748; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm8, %zmm7
4749; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
4750; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4751; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
4752; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
4753; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm2, %zmm9
4754; AVX512DQ-BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
4755; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
4756; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
4757; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
4758; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm9, %zmm8
4759; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
4760; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm1, %zmm9
4761; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
4762; AVX512DQ-BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
4763; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
4764; AVX512DQ-BW-NEXT:    movb $7, %al
4765; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
4766; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
4767; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
4768; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
4769; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, (%rsi)
4770; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, (%rdx)
4771; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rcx)
4772; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%r8)
4773; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%r9)
4774; AVX512DQ-BW-NEXT:    vzeroupper
4775; AVX512DQ-BW-NEXT:    retq
4776;
4777; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf32:
4778; AVX512DQ-BW-FCP:       # %bb.0:
4779; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm0
4780; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
4781; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
4782; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
4783; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
4784; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
4785; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
4786; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm5
4787; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
4788; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm1, %zmm6
4789; AVX512DQ-BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
4790; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
4791; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm6 {%k1}
4792; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
4793; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm6, %zmm5
4794; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
4795; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4796; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
4797; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
4798; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm1, %zmm7
4799; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm6, %zmm7 {%k1}
4800; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
4801; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm6
4802; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
4803; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4804; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
4805; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
4806; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm2, %zmm8
4807; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm8 {%k1}
4808; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
4809; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm8, %zmm7
4810; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
4811; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4812; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
4813; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
4814; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm2, %zmm9
4815; AVX512DQ-BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
4816; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
4817; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
4818; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
4819; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm9, %zmm8
4820; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
4821; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm1, %zmm9
4822; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
4823; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
4824; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
4825; AVX512DQ-BW-FCP-NEXT:    movb $7, %al
4826; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
4827; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
4828; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
4829; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
4830; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rsi)
4831; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rdx)
4832; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
4833; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%r8)
4834; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
4835; AVX512DQ-BW-FCP-NEXT:    vzeroupper
4836; AVX512DQ-BW-FCP-NEXT:    retq
4837  %wide.vec = load <160 x i16>, ptr %in.vec, align 64
4838  %strided.vec0 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155>
4839  %strided.vec1 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156>
4840  %strided.vec2 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157>
4841  %strided.vec3 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158>
4842  %strided.vec4 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159>
4843  store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
4844  store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
4845  store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
4846  store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
4847  store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
4848  ret void
4849}
4850
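; A minimal C sketch (an illustrative addition, not part of the generated
; checks) of the scalar loop shape that interleaved-access vectorization
; turns into the stride-5 shufflevector patterns exercised by these tests;
; the function and parameter names are hypothetical.
;
;   void deinterleave_i16_stride5(const short *in, short *out0, short *out1,
;                                 short *out2, short *out3, short *out4,
;                                 int n) {
;     for (int i = 0; i < n; ++i) {
;       out0[i] = in[5 * i + 0]; // lane 0 of each group of 5 i16 elements
;       out1[i] = in[5 * i + 1];
;       out2[i] = in[5 * i + 2];
;       out3[i] = in[5 * i + 3];
;       out4[i] = in[5 * i + 4];
;     }
;   }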
4851define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
4852; SSE-LABEL: load_i16_stride5_vf64:
4853; SSE:       # %bb.0:
4854; SSE-NEXT:    subq $1016, %rsp # imm = 0x3F8
4855; SSE-NEXT:    movdqa 464(%rdi), %xmm5
4856; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4857; SSE-NEXT:    movdqa 400(%rdi), %xmm8
4858; SSE-NEXT:    movdqa 416(%rdi), %xmm11
4859; SSE-NEXT:    movdqa 448(%rdi), %xmm4
4860; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4861; SSE-NEXT:    movdqa 432(%rdi), %xmm7
4862; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4863; SSE-NEXT:    movdqa 144(%rdi), %xmm6
4864; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4865; SSE-NEXT:    movdqa 80(%rdi), %xmm15
4866; SSE-NEXT:    movdqa 96(%rdi), %xmm10
4867; SSE-NEXT:    movdqa 128(%rdi), %xmm14
4868; SSE-NEXT:    movdqa 112(%rdi), %xmm2
4869; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4870; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
4871; SSE-NEXT:    movdqa %xmm0, %xmm1
4872; SSE-NEXT:    pandn %xmm2, %xmm1
4873; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3]
4874; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4875; SSE-NEXT:    pand %xmm0, %xmm2
4876; SSE-NEXT:    por %xmm1, %xmm2
4877; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
4878; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4879; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4880; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3]
4881; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
4882; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4883; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
4884; SSE-NEXT:    movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
4885; SSE-NEXT:    andps %xmm13, %xmm3
4886; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1]
4887; SSE-NEXT:    movaps %xmm13, %xmm2
4888; SSE-NEXT:    pandn %xmm1, %xmm2
4889; SSE-NEXT:    por %xmm3, %xmm2
4890; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4891; SSE-NEXT:    movdqa %xmm0, %xmm1
4892; SSE-NEXT:    pandn %xmm7, %xmm1
4893; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
4894; SSE-NEXT:    pand %xmm0, %xmm2
4895; SSE-NEXT:    por %xmm1, %xmm2
4896; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
4897; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4898; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4899; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3]
4900; SSE-NEXT:    movdqa %xmm8, %xmm6
4901; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4902; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7]
4903; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4904; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
4905; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
4906; SSE-NEXT:    movaps %xmm13, %xmm2
4907; SSE-NEXT:    andnps %xmm1, %xmm2
4908; SSE-NEXT:    movdqa 32(%rdi), %xmm3
4909; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4910; SSE-NEXT:    andps %xmm13, %xmm4
4911; SSE-NEXT:    orps %xmm4, %xmm2
4912; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4913; SSE-NEXT:    movdqa %xmm0, %xmm1
4914; SSE-NEXT:    pandn %xmm3, %xmm1
4915; SSE-NEXT:    movdqa 48(%rdi), %xmm2
4916; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4917; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4918; SSE-NEXT:    pand %xmm0, %xmm2
4919; SSE-NEXT:    por %xmm1, %xmm2
4920; SSE-NEXT:    movdqa 16(%rdi), %xmm1
4921; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4922; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
4923; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4924; SSE-NEXT:    movdqa (%rdi), %xmm9
4925; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
4926; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4927; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4928; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4929; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
4930; SSE-NEXT:    movdqa 64(%rdi), %xmm1
4931; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4932; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4933; SSE-NEXT:    movaps %xmm13, %xmm2
4934; SSE-NEXT:    andnps %xmm1, %xmm2
4935; SSE-NEXT:    andps %xmm13, %xmm4
4936; SSE-NEXT:    orps %xmm4, %xmm2
4937; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4938; SSE-NEXT:    movdqa 352(%rdi), %xmm2
4939; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4940; SSE-NEXT:    movdqa %xmm0, %xmm1
4941; SSE-NEXT:    pandn %xmm2, %xmm1
4942; SSE-NEXT:    movdqa 368(%rdi), %xmm2
4943; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4944; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4945; SSE-NEXT:    pand %xmm0, %xmm2
4946; SSE-NEXT:    por %xmm1, %xmm2
4947; SSE-NEXT:    movdqa 336(%rdi), %xmm1
4948; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
4949; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
4950; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4951; SSE-NEXT:    movdqa 320(%rdi), %xmm7
4952; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
4953; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4954; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4955; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4956; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
4957; SSE-NEXT:    movdqa 384(%rdi), %xmm1
4958; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4959; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4960; SSE-NEXT:    movaps %xmm13, %xmm2
4961; SSE-NEXT:    andnps %xmm1, %xmm2
4962; SSE-NEXT:    andps %xmm13, %xmm4
4963; SSE-NEXT:    orps %xmm4, %xmm2
4964; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4965; SSE-NEXT:    movdqa 272(%rdi), %xmm2
4966; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4967; SSE-NEXT:    movdqa %xmm0, %xmm1
4968; SSE-NEXT:    pandn %xmm2, %xmm1
4969; SSE-NEXT:    movdqa 288(%rdi), %xmm2
4970; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4971; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4972; SSE-NEXT:    pand %xmm0, %xmm2
4973; SSE-NEXT:    por %xmm1, %xmm2
4974; SSE-NEXT:    movdqa 256(%rdi), %xmm12
4975; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
4976; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4977; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4978; SSE-NEXT:    movdqa 240(%rdi), %xmm3
4979; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4980; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
4981; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4982; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4983; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
4984; SSE-NEXT:    movdqa 304(%rdi), %xmm1
4985; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4986; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4987; SSE-NEXT:    movaps %xmm13, %xmm2
4988; SSE-NEXT:    andnps %xmm1, %xmm2
4989; SSE-NEXT:    andps %xmm13, %xmm4
4990; SSE-NEXT:    orps %xmm4, %xmm2
4991; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4992; SSE-NEXT:    movdqa 592(%rdi), %xmm2
4993; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4994; SSE-NEXT:    movdqa %xmm0, %xmm1
4995; SSE-NEXT:    pandn %xmm2, %xmm1
4996; SSE-NEXT:    movdqa 608(%rdi), %xmm2
4997; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4998; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4999; SSE-NEXT:    pand %xmm0, %xmm2
5000; SSE-NEXT:    por %xmm1, %xmm2
5001; SSE-NEXT:    movdqa 576(%rdi), %xmm1
5002; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5003; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
5004; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
5005; SSE-NEXT:    movdqa 560(%rdi), %xmm3
5006; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
5007; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5008; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
5009; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5010; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
5011; SSE-NEXT:    movdqa 624(%rdi), %xmm1
5012; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5013; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
5014; SSE-NEXT:    movaps %xmm13, %xmm2
5015; SSE-NEXT:    andnps %xmm1, %xmm2
5016; SSE-NEXT:    andps %xmm13, %xmm4
5017; SSE-NEXT:    orps %xmm4, %xmm2
5018; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5019; SSE-NEXT:    movdqa 192(%rdi), %xmm2
5020; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5021; SSE-NEXT:    movdqa %xmm0, %xmm1
5022; SSE-NEXT:    pandn %xmm2, %xmm1
5023; SSE-NEXT:    movdqa 208(%rdi), %xmm2
5024; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5025; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3]
5026; SSE-NEXT:    pand %xmm0, %xmm4
5027; SSE-NEXT:    por %xmm1, %xmm4
5028; SSE-NEXT:    movdqa 176(%rdi), %xmm1
5029; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5030; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
5031; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
5032; SSE-NEXT:    movdqa 160(%rdi), %xmm2
5033; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5034; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3]
5035; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
5036; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5037; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5038; SSE-NEXT:    movdqa 224(%rdi), %xmm1
5039; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5040; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
5041; SSE-NEXT:    movaps %xmm13, %xmm4
5042; SSE-NEXT:    andnps %xmm1, %xmm4
5043; SSE-NEXT:    andps %xmm13, %xmm5
5044; SSE-NEXT:    orps %xmm5, %xmm4
5045; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5046; SSE-NEXT:    movdqa 528(%rdi), %xmm1
5047; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5048; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
5049; SSE-NEXT:    pand %xmm0, %xmm1
5050; SSE-NEXT:    movdqa 512(%rdi), %xmm2
5051; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5052; SSE-NEXT:    pandn %xmm2, %xmm0
5053; SSE-NEXT:    por %xmm1, %xmm0
5054; SSE-NEXT:    movdqa 496(%rdi), %xmm1
5055; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5056; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
5057; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
5058; SSE-NEXT:    movdqa 480(%rdi), %xmm2
5059; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5060; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
5061; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
5062; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5063; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
5064; SSE-NEXT:    movdqa 544(%rdi), %xmm0
5065; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5066; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5067; SSE-NEXT:    movaps %xmm13, %xmm1
5068; SSE-NEXT:    andnps %xmm0, %xmm1
5069; SSE-NEXT:    andps %xmm13, %xmm4
5070; SSE-NEXT:    orps %xmm4, %xmm1
5071; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5072; SSE-NEXT:    psrlq $48, %xmm10
5073; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5074; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3]
5075; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
5076; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
5077; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
5078; SSE-NEXT:    movdqa %xmm0, %xmm4
5079; SSE-NEXT:    pandn %xmm1, %xmm4
5080; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3]
5081; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5082; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5083; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5084; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7]
5085; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
5086; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
5087; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
5088; SSE-NEXT:    pand %xmm0, %xmm1
5089; SSE-NEXT:    por %xmm4, %xmm1
5090; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5091; SSE-NEXT:    movdqa %xmm8, %xmm4
5092; SSE-NEXT:    psllq $48, %xmm4
5093; SSE-NEXT:    movaps %xmm13, %xmm2
5094; SSE-NEXT:    andnps %xmm4, %xmm2
5095; SSE-NEXT:    pand %xmm13, %xmm1
5096; SSE-NEXT:    orps %xmm1, %xmm2
5097; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5098; SSE-NEXT:    psrlq $48, %xmm11
5099; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
5100; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5101; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
5102; SSE-NEXT:    movdqa %xmm0, %xmm1
5103; SSE-NEXT:    pandn %xmm4, %xmm1
5104; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5105; SSE-NEXT:    # xmm4 = mem[1,3,2,3]
5106; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5107; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5108; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5109; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
5110; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
5111; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
5112; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
5113; SSE-NEXT:    pand %xmm0, %xmm4
5114; SSE-NEXT:    por %xmm1, %xmm4
5115; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5116; SSE-NEXT:    psllq $48, %xmm1
5117; SSE-NEXT:    movdqa %xmm13, %xmm2
5118; SSE-NEXT:    pandn %xmm1, %xmm2
5119; SSE-NEXT:    pand %xmm13, %xmm4
5120; SSE-NEXT:    por %xmm4, %xmm2
5121; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5122; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5123; SSE-NEXT:    psrlq $48, %xmm1
5124; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3]
5125; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5126; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5127; SSE-NEXT:    movdqa %xmm0, %xmm1
5128; SSE-NEXT:    pandn %xmm4, %xmm1
5129; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5130; SSE-NEXT:    # xmm4 = mem[1,3,2,3]
5131; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5132; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5133; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5134; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
5135; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
5136; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
5137; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
5138; SSE-NEXT:    pand %xmm0, %xmm4
5139; SSE-NEXT:    por %xmm1, %xmm4
5140; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5141; SSE-NEXT:    psllq $48, %xmm1
5142; SSE-NEXT:    movdqa %xmm13, %xmm2
5143; SSE-NEXT:    pandn %xmm1, %xmm2
5144; SSE-NEXT:    pand %xmm13, %xmm4
5145; SSE-NEXT:    por %xmm4, %xmm2
5146; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5147; SSE-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
5148; SSE-NEXT:    psrlq $48, %xmm1
5149; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3]
5150; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5151; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5152; SSE-NEXT:    movdqa %xmm0, %xmm1
5153; SSE-NEXT:    pandn %xmm4, %xmm1
5154; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5155; SSE-NEXT:    # xmm4 = mem[1,3,2,3]
5156; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5157; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5158; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5159; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
5160; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
5161; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
5162; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
5163; SSE-NEXT:    pand %xmm0, %xmm4
5164; SSE-NEXT:    por %xmm1, %xmm4
5165; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5166; SSE-NEXT:    psllq $48, %xmm1
5167; SSE-NEXT:    movdqa %xmm13, %xmm2
5168; SSE-NEXT:    pandn %xmm1, %xmm2
5169; SSE-NEXT:    pand %xmm13, %xmm4
5170; SSE-NEXT:    por %xmm4, %xmm2
5171; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5172; SSE-NEXT:    psrlq $48, %xmm12
5173; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5174; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3]
5175; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5176; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
5177; SSE-NEXT:    movdqa %xmm0, %xmm1
5178; SSE-NEXT:    pandn %xmm4, %xmm1
5179; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5180; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
5181; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5182; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5183; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5184; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
5185; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
5186; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
5187; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
5188; SSE-NEXT:    pand %xmm0, %xmm4
5189; SSE-NEXT:    por %xmm1, %xmm4
5190; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5191; SSE-NEXT:    movdqa %xmm12, %xmm1
5192; SSE-NEXT:    psllq $48, %xmm1
5193; SSE-NEXT:    movdqa %xmm13, %xmm2
5194; SSE-NEXT:    pandn %xmm1, %xmm2
5195; SSE-NEXT:    pand %xmm13, %xmm4
5196; SSE-NEXT:    por %xmm4, %xmm2
5197; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5198; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5199; SSE-NEXT:    psrlq $48, %xmm1
5200; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3]
5201; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5202; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5203; SSE-NEXT:    movdqa %xmm0, %xmm1
5204; SSE-NEXT:    pandn %xmm4, %xmm1
5205; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5206; SSE-NEXT:    # xmm4 = mem[1,3,2,3]
5207; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5208; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5209; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5210; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
5211; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
5212; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
5213; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
5214; SSE-NEXT:    pand %xmm0, %xmm4
5215; SSE-NEXT:    por %xmm1, %xmm4
5216; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5217; SSE-NEXT:    psllq $48, %xmm1
5218; SSE-NEXT:    movdqa %xmm13, %xmm2
5219; SSE-NEXT:    pandn %xmm1, %xmm2
5220; SSE-NEXT:    pand %xmm13, %xmm4
5221; SSE-NEXT:    por %xmm4, %xmm2
5222; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5223; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5224; SSE-NEXT:    movdqa %xmm7, %xmm1
5225; SSE-NEXT:    psrlq $48, %xmm1
5226; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5227; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3]
5228; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5229; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5230; SSE-NEXT:    movdqa %xmm0, %xmm1
5231; SSE-NEXT:    pandn %xmm4, %xmm1
5232; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5233; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
5234; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5235; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
5236; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5237; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
5238; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
5239; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
5240; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
5241; SSE-NEXT:    pand %xmm0, %xmm4
5242; SSE-NEXT:    por %xmm1, %xmm4
5243; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5244; SSE-NEXT:    movdqa %xmm3, %xmm1
5245; SSE-NEXT:    psllq $48, %xmm1
5246; SSE-NEXT:    movdqa %xmm13, %xmm5
5247; SSE-NEXT:    pandn %xmm1, %xmm5
5248; SSE-NEXT:    pand %xmm13, %xmm4
5249; SSE-NEXT:    por %xmm4, %xmm5
5250; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5251; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5252; SSE-NEXT:    psrlq $48, %xmm1
5253; SSE-NEXT:    pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5254; SSE-NEXT:    # xmm4 = mem[0,3,2,3]
5255; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
5256; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5257; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5258; SSE-NEXT:    # xmm1 = mem[1,3,2,3]
5259; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5260; SSE-NEXT:    # xmm5 = mem[0,2,2,3]
5261; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5262; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7]
5263; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
5264; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
5265; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
5266; SSE-NEXT:    pand %xmm0, %xmm1
5267; SSE-NEXT:    pandn %xmm4, %xmm0
5268; SSE-NEXT:    por %xmm1, %xmm0
5269; SSE-NEXT:    pand %xmm13, %xmm0
5270; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5271; SSE-NEXT:    psllq $48, %xmm1
5272; SSE-NEXT:    pandn %xmm1, %xmm13
5273; SSE-NEXT:    por %xmm0, %xmm13
5274; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5275; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5276; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5277; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
5278; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
5279; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5280; SSE-NEXT:    movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535]
5281; SSE-NEXT:    movaps %xmm6, %xmm4
5282; SSE-NEXT:    andnps %xmm1, %xmm4
5283; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3]
5284; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
5285; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5286; SSE-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
5287; SSE-NEXT:    pand %xmm6, %xmm5
5288; SSE-NEXT:    por %xmm4, %xmm5
5289; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5290; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0]
5291; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5292; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5293; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0]
5294; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5295; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5296; SSE-NEXT:    movaps %xmm0, %xmm1
5297; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5298; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0]
5299; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3]
5300; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5301; SSE-NEXT:    movaps %xmm6, %xmm4
5302; SSE-NEXT:    andnps %xmm1, %xmm4
5303; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5304; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3]
5305; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
5306; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5307; SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
5308; SSE-NEXT:    pand %xmm6, %xmm5
5309; SSE-NEXT:    por %xmm4, %xmm5
5310; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5311; SSE-NEXT:    pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5312; SSE-NEXT:    # xmm4 = mem[0,1,2,0]
5313; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5314; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5315; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0]
5316; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5317; SSE-NEXT:    movdqa %xmm10, %xmm1
5318; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5319; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0]
5320; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3]
5321; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5322; SSE-NEXT:    movaps %xmm6, %xmm4
5323; SSE-NEXT:    andnps %xmm1, %xmm4
5324; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3]
5325; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
5326; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5327; SSE-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
5328; SSE-NEXT:    pand %xmm6, %xmm5
5329; SSE-NEXT:    por %xmm4, %xmm5
5330; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5331; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0]
5332; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5333; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5334; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0]
5335; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5336; SSE-NEXT:    movdqa %xmm2, %xmm1
5337; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0]
5338; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3]
5339; SSE-NEXT:    movdqa %xmm9, %xmm12
5340; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5341; SSE-NEXT:    movaps %xmm6, %xmm4
5342; SSE-NEXT:    andnps %xmm1, %xmm4
5343; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3]
5344; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
5345; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
5346; SSE-NEXT:    pand %xmm6, %xmm2
5347; SSE-NEXT:    por %xmm4, %xmm2
5348; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5349; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0]
5350; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5351; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5352; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
5353; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5354; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5355; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5356; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
5357; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
5358; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5359; SSE-NEXT:    movaps %xmm6, %xmm4
5360; SSE-NEXT:    andnps %xmm1, %xmm4
5361; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5362; SSE-NEXT:    # xmm5 = mem[0,1,1,3]
5363; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
5364; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5365; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
5366; SSE-NEXT:    pand %xmm6, %xmm2
5367; SSE-NEXT:    por %xmm4, %xmm2
5368; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5369; SSE-NEXT:    pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5370; SSE-NEXT:    # xmm4 = mem[0,1,2,0]
5371; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5372; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5373; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
5374; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5375; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5376; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5377; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0]
5378; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3]
5379; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5380; SSE-NEXT:    movaps %xmm6, %xmm4
5381; SSE-NEXT:    andnps %xmm1, %xmm4
5382; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5383; SSE-NEXT:    # xmm5 = mem[0,1,1,3]
5384; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
5385; SSE-NEXT:    movdqa (%rsp), %xmm9 # 16-byte Reload
5386; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
5387; SSE-NEXT:    pand %xmm6, %xmm2
5388; SSE-NEXT:    por %xmm4, %xmm2
5389; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5390; SSE-NEXT:    pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5391; SSE-NEXT:    # xmm4 = mem[0,1,2,0]
5392; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5393; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5394; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
5395; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5396; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5397; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5398; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
5399; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
5400; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5401; SSE-NEXT:    movaps %xmm6, %xmm4
5402; SSE-NEXT:    andnps %xmm1, %xmm4
5403; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5404; SSE-NEXT:    # xmm5 = mem[0,1,1,3]
5405; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
5406; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5407; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
5408; SSE-NEXT:    pand %xmm6, %xmm2
5409; SSE-NEXT:    por %xmm4, %xmm2
5410; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5411; SSE-NEXT:    pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5412; SSE-NEXT:    # xmm4 = mem[0,1,2,0]
5413; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5414; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5415; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
5416; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5417; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5418; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5419; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0]
5420; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3]
5421; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
5422; SSE-NEXT:    movaps %xmm6, %xmm4
5423; SSE-NEXT:    andnps %xmm1, %xmm4
5424; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5425; SSE-NEXT:    # xmm5 = mem[0,1,1,3]
5426; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
5427; SSE-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5428; SSE-NEXT:    # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
5429; SSE-NEXT:    pand %xmm6, %xmm2
5430; SSE-NEXT:    por %xmm4, %xmm2
5431; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5432; SSE-NEXT:    pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5433; SSE-NEXT:    # xmm4 = mem[0,1,2,0]
5434; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
5435; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
5436; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
5437; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5438; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
5439; SSE-NEXT:    movdqa %xmm6, %xmm4
5440; SSE-NEXT:    pandn %xmm1, %xmm4
5441; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
5442; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3]
5443; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5444; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7]
5445; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5446; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
5447; SSE-NEXT:    pand %xmm6, %xmm1
5448; SSE-NEXT:    por %xmm4, %xmm1
5449; SSE-NEXT:    movdqa %xmm1, %xmm2
5450; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0]
5451; SSE-NEXT:    movaps %xmm6, %xmm3
5452; SSE-NEXT:    andnps %xmm13, %xmm3
5453; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5454; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2]
5455; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7]
5456; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5457; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
5458; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
5459; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
5460; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0]
5461; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5462; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5463; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7]
5464; SSE-NEXT:    movdqa %xmm6, %xmm3
5465; SSE-NEXT:    pandn %xmm1, %xmm3
5466; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5467; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
5468; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5469; SSE-NEXT:    # xmm4 = mem[0,2,2,3]
5470; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5471; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
5472; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5473; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
5474; SSE-NEXT:    pand %xmm6, %xmm1
5475; SSE-NEXT:    por %xmm3, %xmm1
5476; SSE-NEXT:    movdqa %xmm1, %xmm4
5477; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5478; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0]
5479; SSE-NEXT:    movaps %xmm6, %xmm2
5480; SSE-NEXT:    andnps %xmm14, %xmm2
5481; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5482; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2]
5483; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7]
5484; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5485; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
5486; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
5487; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
5488; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
5489; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5490; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7]
5491; SSE-NEXT:    movdqa %xmm6, %xmm3
5492; SSE-NEXT:    pandn %xmm1, %xmm3
5493; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
5494; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5495; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
5496; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5497; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
5498; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5499; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
5500; SSE-NEXT:    pand %xmm6, %xmm1
5501; SSE-NEXT:    por %xmm3, %xmm1
5502; SSE-NEXT:    movdqa %xmm1, %xmm3
5503; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5504; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0]
5505; SSE-NEXT:    movaps %xmm6, %xmm2
5506; SSE-NEXT:    andnps %xmm12, %xmm2
5507; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5508; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2]
5509; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7]
5510; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5511; SSE-NEXT:    # xmm2 = mem[0,1,0,3]
5512; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
5513; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
5514; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
5515; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5516; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5517; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
5518; SSE-NEXT:    movdqa %xmm6, %xmm2
5519; SSE-NEXT:    pandn %xmm1, %xmm2
5520; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5521; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
5522; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5523; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
5524; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5525; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
5526; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5527; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
5528; SSE-NEXT:    pand %xmm6, %xmm1
5529; SSE-NEXT:    por %xmm2, %xmm1
5530; SSE-NEXT:    movdqa %xmm1, %xmm3
5531; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5532; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0]
5533; SSE-NEXT:    movaps %xmm4, %xmm2
5534; SSE-NEXT:    movaps %xmm6, %xmm4
5535; SSE-NEXT:    andnps %xmm2, %xmm4
5536; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5537; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
5538; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7]
5539; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5540; SSE-NEXT:    # xmm2 = mem[0,1,0,3]
5541; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
5542; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
5543; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
5544; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5545; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7]
5546; SSE-NEXT:    movdqa %xmm6, %xmm2
5547; SSE-NEXT:    pandn %xmm1, %xmm2
5548; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5549; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
5550; SSE-NEXT:    movdqa %xmm9, %xmm11
5551; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
5552; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5553; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
5554; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5555; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7]
5556; SSE-NEXT:    pand %xmm6, %xmm0
5557; SSE-NEXT:    por %xmm2, %xmm0
5558; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5559; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0]
5560; SSE-NEXT:    movaps %xmm6, %xmm2
5561; SSE-NEXT:    andnps %xmm7, %xmm2
5562; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5563; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2]
5564; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7]
5565; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5566; SSE-NEXT:    # xmm2 = mem[0,1,0,3]
5567; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
5568; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
5569; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
5570; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5571; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5572; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
5573; SSE-NEXT:    movdqa %xmm6, %xmm2
5574; SSE-NEXT:    pandn %xmm1, %xmm2
5575; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5576; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
5577; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5578; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
5579; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5580; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
5581; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5582; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7]
5583; SSE-NEXT:    pand %xmm6, %xmm0
5584; SSE-NEXT:    por %xmm2, %xmm0
5585; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5586; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0]
5587; SSE-NEXT:    movaps %xmm4, %xmm2
5588; SSE-NEXT:    movaps %xmm6, %xmm3
5589; SSE-NEXT:    andnps %xmm4, %xmm3
5590; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5591; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
5592; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7]
5593; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5594; SSE-NEXT:    # xmm2 = mem[0,1,0,3]
5595; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
5596; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
5597; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
5598; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5599; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7]
5600; SSE-NEXT:    movdqa %xmm6, %xmm2
5601; SSE-NEXT:    pandn %xmm1, %xmm2
5602; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5603; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
5604; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5605; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
5606; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5607; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
5608; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5609; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7]
5610; SSE-NEXT:    pand %xmm6, %xmm14
5611; SSE-NEXT:    por %xmm2, %xmm14
5612; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5613; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0]
5614; SSE-NEXT:    movaps %xmm6, %xmm0
5615; SSE-NEXT:    andnps %xmm10, %xmm0
5616; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5617; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2]
5618; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7]
5619; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5620; SSE-NEXT:    # xmm2 = mem[0,1,0,3]
5621; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
5622; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
5623; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0]
5624; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5625; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7]
5626; SSE-NEXT:    movdqa %xmm6, %xmm2
5627; SSE-NEXT:    pandn %xmm1, %xmm2
5628; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5629; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
5630; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5631; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
5632; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5633; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
5634; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5635; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7]
5636; SSE-NEXT:    pand %xmm6, %xmm10
5637; SSE-NEXT:    por %xmm2, %xmm10
5638; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5639; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5640; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
5641; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
5642; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5643; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5644; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
5645; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
5646; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5647; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0]
5648; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2]
5649; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5650; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0]
5651; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2]
5652; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5653; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0]
5654; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
5655; SSE-NEXT:    movaps %xmm11, (%rsp) # 16-byte Spill
5656; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0]
5657; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2]
5658; SSE-NEXT:    movdqa %xmm7, %xmm1
5659; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[3,0]
5660; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2]
5661; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5662; SSE-NEXT:    movdqa %xmm4, %xmm1
5663; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0]
5664; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2]
5665; SSE-NEXT:    movaps %xmm1, %xmm15
5666; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5667; SSE-NEXT:    movaps %xmm4, %xmm1
5668; SSE-NEXT:    movaps %xmm4, %xmm12
5669; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
5670; SSE-NEXT:    movaps %xmm0, %xmm11
5671; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
5672; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5673; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7]
5674; SSE-NEXT:    pand %xmm6, %xmm8
5675; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
5676; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5677; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
5678; SSE-NEXT:    pand %xmm6, %xmm7
5679; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7]
5680; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5681; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7]
5682; SSE-NEXT:    pand %xmm6, %xmm0
5683; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
5684; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5685; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7]
5686; SSE-NEXT:    pand %xmm6, %xmm5
5687; SSE-NEXT:    pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload
5688; SSE-NEXT:    # xmm2 = mem[0,1,2,3,4,6,6,7]
5689; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5690; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7]
5691; SSE-NEXT:    pand %xmm6, %xmm4
5692; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7]
5693; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5694; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
5695; SSE-NEXT:    pand %xmm6, %xmm3
5696; SSE-NEXT:    pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5697; SSE-NEXT:    # xmm2 = mem[0,1,2,3,4,6,6,7]
5698; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5699; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
5700; SSE-NEXT:    pand %xmm6, %xmm2
5701; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7]
5702; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
5703; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7]
5704; SSE-NEXT:    pand %xmm6, %xmm9
5705; SSE-NEXT:    andnps %xmm11, %xmm6
5706; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
5707; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7]
5708; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5709; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm11[0,1,0,3]
5710; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
5711; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3]
5712; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0]
5713; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
5714; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5715; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5716; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5717; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5718; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5719; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5720; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5721; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,0]
5722; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5723; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5724; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5725; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5726; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5727; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5728; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5729; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5730; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0]
5731; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5732; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5733; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5734; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5735; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5736; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5737; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5738; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5739; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0]
5740; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5741; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5742; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5743; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5744; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5745; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5746; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5747; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5748; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0]
5749; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5750; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5751; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5752; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5753; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5754; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5755; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5756; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5757; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0]
5758; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5759; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5760; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5761; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5762; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5763; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5764; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5765; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5766; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0]
5767; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5768; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5769; SSE-NEXT:    # xmm1 = mem[0,2,2,3]
5770; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5771; SSE-NEXT:    # xmm15 = mem[0,1,1,3]
5772; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5773; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
5774; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
5775; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
5776; SSE-NEXT:    orps %xmm9, %xmm6
5777; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3]
5778; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3]
5779; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5780; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
5781; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3]
5782; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0]
5783; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5784; SSE-NEXT:    movaps %xmm1, 96(%rsi)
5785; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5786; SSE-NEXT:    movaps %xmm1, 32(%rsi)
5787; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5788; SSE-NEXT:    movaps %xmm1, 112(%rsi)
5789; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5790; SSE-NEXT:    movaps %xmm1, 48(%rsi)
5791; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5792; SSE-NEXT:    movaps %xmm1, 64(%rsi)
5793; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5794; SSE-NEXT:    movaps %xmm1, (%rsi)
5795; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5796; SSE-NEXT:    movaps %xmm1, 80(%rsi)
5797; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5798; SSE-NEXT:    movaps %xmm1, 16(%rsi)
5799; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5800; SSE-NEXT:    movaps %xmm1, 96(%rdx)
5801; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5802; SSE-NEXT:    movaps %xmm1, 32(%rdx)
5803; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5804; SSE-NEXT:    movaps %xmm1, 112(%rdx)
5805; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5806; SSE-NEXT:    movaps %xmm1, 48(%rdx)
5807; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5808; SSE-NEXT:    movaps %xmm1, 64(%rdx)
5809; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5810; SSE-NEXT:    movaps %xmm1, (%rdx)
5811; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5812; SSE-NEXT:    movaps %xmm1, 80(%rdx)
5813; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5814; SSE-NEXT:    movaps %xmm1, 16(%rdx)
5815; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5816; SSE-NEXT:    movaps %xmm1, 96(%rcx)
5817; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5818; SSE-NEXT:    movaps %xmm1, 112(%rcx)
5819; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5820; SSE-NEXT:    movaps %xmm1, 64(%rcx)
5821; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5822; SSE-NEXT:    movaps %xmm1, 80(%rcx)
5823; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5824; SSE-NEXT:    movaps %xmm1, 32(%rcx)
5825; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5826; SSE-NEXT:    movaps %xmm1, 48(%rcx)
5827; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5828; SSE-NEXT:    movaps %xmm1, (%rcx)
5829; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5830; SSE-NEXT:    movaps %xmm1, 16(%rcx)
5831; SSE-NEXT:    movaps %xmm10, 112(%r8)
5832; SSE-NEXT:    movaps %xmm14, 96(%r8)
5833; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5834; SSE-NEXT:    movaps %xmm1, 80(%r8)
5835; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5836; SSE-NEXT:    movaps %xmm1, 64(%r8)
5837; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5838; SSE-NEXT:    movaps %xmm1, 48(%r8)
5839; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5840; SSE-NEXT:    movaps %xmm1, 32(%r8)
5841; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5842; SSE-NEXT:    movaps %xmm1, 16(%r8)
5843; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5844; SSE-NEXT:    movaps %xmm1, (%r8)
5845; SSE-NEXT:    movaps %xmm6, 112(%r9)
5846; SSE-NEXT:    movaps %xmm2, 96(%r9)
5847; SSE-NEXT:    movaps %xmm3, 80(%r9)
5848; SSE-NEXT:    movaps %xmm4, 64(%r9)
5849; SSE-NEXT:    movaps %xmm5, 48(%r9)
5850; SSE-NEXT:    movaps %xmm0, 32(%r9)
5851; SSE-NEXT:    movaps %xmm7, 16(%r9)
5852; SSE-NEXT:    movaps %xmm8, (%r9)
5853; SSE-NEXT:    addq $1016, %rsp # imm = 0x3F8
5854; SSE-NEXT:    retq
5855;
5856; AVX-LABEL: load_i16_stride5_vf64:
5857; AVX:       # %bb.0:
5858; AVX-NEXT:    subq $1032, %rsp # imm = 0x408
5859; AVX-NEXT:    vmovdqa 304(%rdi), %xmm0
5860; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5861; AVX-NEXT:    vmovdqa 288(%rdi), %xmm1
5862; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5863; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
5864; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
5865; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
5866; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
5867; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5868; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
5869; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5870; AVX-NEXT:    vmovdqa 272(%rdi), %xmm15
5871; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1]
5872; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5873; AVX-NEXT:    vmovdqa 240(%rdi), %xmm3
5874; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5875; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
5876; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
5877; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
5878; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
5879; AVX-NEXT:    vmovdqa 208(%rdi), %xmm0
5880; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5881; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
5882; AVX-NEXT:    vmovdqa 192(%rdi), %xmm3
5883; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5884; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
5885; AVX-NEXT:    vmovdqa 176(%rdi), %xmm3
5886; AVX-NEXT:    vmovdqa %xmm3, (%rsp) # 16-byte Spill
5887; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
5888; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
5889; AVX-NEXT:    vmovdqa 160(%rdi), %xmm4
5890; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5891; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
5892; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
5893; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5894; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
5895; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
5896; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
5897; AVX-NEXT:    vmovaps 224(%rdi), %xmm0
5898; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5899; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
5900; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
5901; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
5902; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
5903; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5904; AVX-NEXT:    vmovdqa 576(%rdi), %xmm0
5905; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5906; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5907; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5908; AVX-NEXT:    vmovdqa 592(%rdi), %xmm12
5909; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1]
5910; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5911; AVX-NEXT:    vmovdqa 560(%rdi), %xmm0
5912; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5913; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
5914; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
5915; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
5916; AVX-NEXT:    vmovdqa 624(%rdi), %xmm3
5917; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5918; AVX-NEXT:    vmovdqa 608(%rdi), %xmm0
5919; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5920; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
5921; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
5922; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
5923; AVX-NEXT:    vmovdqa 496(%rdi), %xmm0
5924; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5925; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
5926; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
5927; AVX-NEXT:    vmovdqa 480(%rdi), %xmm9
5928; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
5929; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5930; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
5931; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5932; AVX-NEXT:    vmovdqa 528(%rdi), %xmm0
5933; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5934; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
5935; AVX-NEXT:    vmovdqa 512(%rdi), %xmm13
5936; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7]
5937; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5938; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
5939; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
5940; AVX-NEXT:    vmovaps 544(%rdi), %xmm11
5941; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm11[0,1,0,1]
5942; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5943; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
5944; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
5945; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
5946; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5947; AVX-NEXT:    vmovdqa 96(%rdi), %xmm10
5948; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
5949; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5950; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5951; AVX-NEXT:    vmovdqa 112(%rdi), %xmm0
5952; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5953; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
5954; AVX-NEXT:    vmovdqa 80(%rdi), %xmm0
5955; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5956; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
5957; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
5958; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
5959; AVX-NEXT:    vmovdqa 144(%rdi), %xmm7
5960; AVX-NEXT:    vmovdqa 128(%rdi), %xmm6
5961; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
5962; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5963; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5964; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
5965; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
5966; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
5967; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5968; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
5969; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
5970; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5971; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5972; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
5973; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
5974; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5975; AVX-NEXT:    vmovdqa 32(%rdi), %xmm0
5976; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5977; AVX-NEXT:    vmovdqa 48(%rdi), %xmm4
5978; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5979; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
5980; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7]
5981; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
5982; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
5983; AVX-NEXT:    vmovaps 64(%rdi), %xmm0
5984; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5985; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
5986; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
5987; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
5988; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
5989; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5990; AVX-NEXT:    vmovdqa 464(%rdi), %xmm8
5991; AVX-NEXT:    vmovdqa 448(%rdi), %xmm0
5992; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5993; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
5994; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5995; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
5996; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
5997; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5998; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5999; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
6000; AVX-NEXT:    vmovdqa 432(%rdi), %xmm0
6001; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6002; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
6003; AVX-NEXT:    vmovdqa 400(%rdi), %xmm0
6004; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6005; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
6006; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
6007; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
6008; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
6009; AVX-NEXT:    vmovdqa 336(%rdi), %xmm0
6010; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6011; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
6012; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
6013; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
6014; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6015; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
6016; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
6017; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6018; AVX-NEXT:    vmovdqa 368(%rdi), %xmm0
6019; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6020; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
6021; AVX-NEXT:    vmovdqa 352(%rdi), %xmm0
6022; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6023; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7]
6024; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
6025; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
6026; AVX-NEXT:    vmovaps 384(%rdi), %xmm0
6027; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6028; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1]
6029; AVX-NEXT:    vandnps %ymm3, %ymm5, %ymm3
6030; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm2
6031; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
6032; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6033; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
6034; AVX-NEXT:    # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
6035; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7]
6036; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
6037; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6038; AVX-NEXT:    # xmm2 = mem[0,3,2,3]
6039; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
6040; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
6041; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6042; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6043; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
6044; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
6045; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6046; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
6047; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6048; AVX-NEXT:    # xmm1 = mem[0,3,2,3]
6049; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
6050; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
6051; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm15
6052; AVX-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
6053; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6054; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6055; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
6056; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
6057; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
6058; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7]
6059; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
6060; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6061; AVX-NEXT:    vpsllq $48, %xmm4, %xmm15
6062; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
6063; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
6064; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
6065; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6066; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload
6067; AVX-NEXT:    # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
6068; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
6069; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
6070; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6071; AVX-NEXT:    # xmm3 = mem[0,3,2,3]
6072; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
6073; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
6074; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6075; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
6076; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
6077; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6078; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
6079; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3]
6080; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
6081; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6082; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm15
6083; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
6084; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6085; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7]
6086; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
6087; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
6088; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
6089; AVX-NEXT:    vpsllq $48, %xmm11, %xmm15
6090; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
6091; AVX-NEXT:    vorps %ymm3, %ymm15, %ymm3
6092; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
6093; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6094; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6095; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7]
6096; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
6097; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
6098; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6099; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3]
6100; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
6101; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
6102; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7]
6103; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6104; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
6105; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6106; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
6107; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
6108; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6109; AVX-NEXT:    vpsrlq $48, %xmm10, %xmm15
6110; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
6111; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6112; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6113; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
6114; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
6115; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
6116; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
6117; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
6118; AVX-NEXT:    vpsllq $48, %xmm7, %xmm15
6119; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
6120; AVX-NEXT:    vorps %ymm3, %ymm15, %ymm3
6121; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
6122; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6123; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
6124; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7]
6125; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
6126; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6127; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload
6128; AVX-NEXT:    # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7]
6129; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7]
6130; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
6131; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6132; AVX-NEXT:    # xmm3 = mem[0,3,2,3]
6133; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
6134; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
6135; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7]
6136; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6137; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6138; AVX-NEXT:    # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7]
6139; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
6140; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6141; AVX-NEXT:    # xmm2 = mem[0,3,2,3]
6142; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
6143; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6144; AVX-NEXT:    vpsrlq $48, %xmm3, %xmm3
6145; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
6146; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6147; AVX-NEXT:    vandps %ymm5, %ymm1, %ymm1
6148; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6149; AVX-NEXT:    vpsllq $48, %xmm2, %xmm2
6150; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
6151; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm1
6152; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
6153; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6154; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6155; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
6156; AVX-NEXT:    # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
6157; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
6158; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
6159; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6160; AVX-NEXT:    # xmm2 = mem[3,1,2,3]
6161; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
6162; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
6163; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6164; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
6165; AVX-NEXT:    # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
6166; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
6167; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6168; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
6169; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6170; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
6171; AVX-NEXT:    # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7]
6172; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
6173; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
6174; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6175; AVX-NEXT:    # xmm15 = mem[0,1,1,3]
6176; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
6177; AVX-NEXT:    vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload
6178; AVX-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
6179; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
6180; AVX-NEXT:    vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6181; AVX-NEXT:    # xmm15 = mem[0,1,2,0]
6182; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
6183; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
6184; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
6185; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6186; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6187; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6188; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
6189; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
6190; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6191; AVX-NEXT:    # xmm5 = mem[3,1,2,3]
6192; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
6193; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
6194; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
6195; AVX-NEXT:    # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
6196; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
6197; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
6198; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
6199; AVX-NEXT:    # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7]
6200; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
6201; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6202; AVX-NEXT:    # xmm15 = mem[0,1,1,3]
6203; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
6204; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6205; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6206; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
6207; AVX-NEXT:    vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6208; AVX-NEXT:    # xmm15 = mem[0,1,2,0]
6209; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
6210; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
6211; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
6212; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6213; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
6214; AVX-NEXT:    # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
6215; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
6216; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3]
6217; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
6218; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
6219; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6220; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6221; AVX-NEXT:    # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7]
6222; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
6223; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
6224; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
6225; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
6226; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3]
6227; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
6228; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3]
6229; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
6230; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0]
6231; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
6232; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
6233; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
6234; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6235; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6236; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6237; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
6238; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
6239; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
6240; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
6241; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
6242; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6243; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
6244; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
6245; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
6246; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
6247; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6248; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6249; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
6250; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
6251; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6252; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3]
6253; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
6254; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6255; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
6256; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
6257; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6258; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0]
6259; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
6260; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
6261; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
6262; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6263; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6264; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
6265; AVX-NEXT:    # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
6266; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
6267; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6268; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm2
6269; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
6270; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6271; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
6272; AVX-NEXT:    # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7]
6273; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
6274; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6275; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
6276; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
6277; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
6278; AVX-NEXT:    # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
6279; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
6280; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6281; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
6282; AVX-NEXT:    # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
6283; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
6284; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
6285; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
6286; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6287; AVX-NEXT:    # xmm15 = mem[0,1,0,3]
6288; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
6289; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
6290; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
6291; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6292; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6293; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6294; AVX-NEXT:    # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
6295; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
6296; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
6297; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6298; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm5
6299; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
6300; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6301; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
6302; AVX-NEXT:    # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
6303; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
6304; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
6305; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
6306; AVX-NEXT:    # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
6307; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
6308; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
6309; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6310; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
6311; AVX-NEXT:    # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
6312; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
6313; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
6314; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
6315; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6316; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3]
6317; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
6318; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
6319; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
6320; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6321; AVX-NEXT:    vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
6322; AVX-NEXT:    # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7]
6323; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
6324; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm5
6325; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
6326; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
6327; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
6328; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
6329; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
6330; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
6331; AVX-NEXT:    vmovdqa %xmm0, %xmm4
6332; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
6333; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
6334; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
6335; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
6336; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
6337; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
6338; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
6339; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
6340; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6341; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6342; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
6343; AVX-NEXT:    # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
6344; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
6345; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6346; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6347; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
6348; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
6349; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
6350; AVX-NEXT:    vpsrlq $48, %xmm7, %xmm3
6351; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
6352; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
6353; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6354; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6355; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
6356; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
6357; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6358; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6359; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7]
6360; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
6361; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
6362; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7]
6363; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6364; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
6365; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
6366; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
6367; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
6368; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6369; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6370; AVX-NEXT:    # xmm0 = mem[3,1,2,3]
6371; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
6372; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6373; AVX-NEXT:    # xmm1 = mem[0,2,2,3]
6374; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
6375; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6376; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6377; AVX-NEXT:    # xmm1 = mem[0,3,2,3]
6378; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6379; AVX-NEXT:    # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7]
6380; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6381; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
6382; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
6383; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
6384; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
6385; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6386; AVX-NEXT:    # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7]
6387; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6388; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
6389; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6390; AVX-NEXT:    # xmm5 = mem[0,2,2,3]
6391; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
6392; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
6393; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
6394; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
6395; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7]
6396; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6397; AVX-NEXT:    # xmm5 = mem[0,1,1,3]
6398; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
6399; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7]
6400; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
6401; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6402; AVX-NEXT:    # xmm0 = mem[3,1,2,3]
6403; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
6404; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6405; AVX-NEXT:    # xmm5 = mem[0,2,2,3]
6406; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
6407; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6408; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6409; AVX-NEXT:    # xmm5 = mem[0,3,2,3]
6410; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6411; AVX-NEXT:    # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
6412; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6413; AVX-NEXT:    # xmm15 = mem[2,3,2,3]
6414; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7]
6415; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
6416; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6417; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6418; AVX-NEXT:    # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7]
6419; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6420; AVX-NEXT:    # xmm15 = mem[1,1,1,1]
6421; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
6422; AVX-NEXT:    # xmm12 = mem[0,2,2,3]
6423; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
6424; AVX-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
6425; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
6426; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7]
6427; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3]
6428; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
6429; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
6430; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
6431; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3]
6432; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
6433; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
6434; AVX-NEXT:    # xmm12 = mem[0,2,2,3]
6435; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
6436; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
6437; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3]
6438; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7]
6439; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
6440; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7]
6441; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7]
6442; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
6443; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1]
6444; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3]
6445; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7]
6446; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
6447; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
6448; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7]
6449; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3]
6450; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
6451; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
6452; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm12, %ymm12
6453; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6454; AVX-NEXT:    # xmm5 = mem[3,1,2,3]
6455; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
6456; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
6457; AVX-NEXT:    # xmm13 = mem[0,2,2,3]
6458; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
6459; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
6460; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
6461; AVX-NEXT:    # xmm13 = mem[0,3,2,3]
6462; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
6463; AVX-NEXT:    # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7]
6464; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
6465; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
6466; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7]
6467; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7]
6468; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6469; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload
6470; AVX-NEXT:    # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
6471; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
6472; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
6473; AVX-NEXT:    # xmm4 = mem[1,1,1,1]
6474; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
6475; AVX-NEXT:    # xmm7 = mem[0,2,2,3]
6476; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
6477; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
6478; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7]
6479; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
6480; AVX-NEXT:    # xmm4 = mem[0,1,1,3]
6481; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
6482; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
6483; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
6484; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6485; AVX-NEXT:    vmovaps %ymm3, 64(%rsi)
6486; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6487; AVX-NEXT:    vmovaps %ymm4, (%rsi)
6488; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6489; AVX-NEXT:    vmovaps %ymm4, 96(%rsi)
6490; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6491; AVX-NEXT:    vmovaps %ymm4, 32(%rsi)
6492; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6493; AVX-NEXT:    vmovaps %ymm3, 64(%rdx)
6494; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6495; AVX-NEXT:    vmovaps %ymm3, (%rdx)
6496; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6497; AVX-NEXT:    vmovaps %ymm3, 96(%rdx)
6498; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6499; AVX-NEXT:    vmovaps %ymm3, 32(%rdx)
6500; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6501; AVX-NEXT:    vmovaps %ymm3, 64(%rcx)
6502; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6503; AVX-NEXT:    vmovaps %ymm3, (%rcx)
6504; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6505; AVX-NEXT:    vmovaps %ymm3, 96(%rcx)
6506; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6507; AVX-NEXT:    vmovaps %ymm3, 32(%rcx)
6508; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6509; AVX-NEXT:    vmovaps %ymm3, 64(%r8)
6510; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6511; AVX-NEXT:    vmovaps %ymm3, (%r8)
6512; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6513; AVX-NEXT:    vmovaps %ymm3, 96(%r8)
6514; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6515; AVX-NEXT:    vmovaps %ymm3, 32(%r8)
6516; AVX-NEXT:    vmovaps %ymm2, 64(%r9)
6517; AVX-NEXT:    vmovaps %ymm12, (%r9)
6518; AVX-NEXT:    vmovaps %ymm0, 96(%r9)
6519; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
6520; AVX-NEXT:    addq $1032, %rsp # imm = 0x408
6521; AVX-NEXT:    vzeroupper
6522; AVX-NEXT:    retq
6523;
6524; AVX2-LABEL: load_i16_stride5_vf64:
6525; AVX2:       # %bb.0:
6526; AVX2-NEXT:    subq $1048, %rsp # imm = 0x418
6527; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm10
6528; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm4
6529; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm14
6530; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm11
6531; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm8
6532; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6533; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
6534; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6535; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm5
6536; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6537; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm0
6538; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6539; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm1
6540; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
6541; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
6542; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6543; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
6544; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
6545; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
6546; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15]
6547; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
6548; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
6549; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
6550; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
6551; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
6552; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
6553; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6554; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15]
6555; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6556; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6557; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
6558; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
6559; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6560; AVX2-NEXT:    vmovdqa %ymm4, %ymm8
6561; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6562; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
6563; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
6564; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm13
6565; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
6566; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
6567; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
6568; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6569; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15]
6570; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6571; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6572; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6573; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
6574; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm4
6575; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm15
6576; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5],ymm4[6],ymm15[7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13],ymm4[14],ymm15[15]
6577; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6578; AVX2-NEXT:    vmovdqa %ymm4, %ymm6
6579; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6580; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
6581; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
6582; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
6583; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
6584; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
6585; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6586; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm9
6587; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm12
6588; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm12[1,2],ymm9[3],ymm12[4],ymm9[5],ymm12[6,7],ymm9[8],ymm12[9,10],ymm9[11],ymm12[12],ymm9[13],ymm12[14,15]
6589; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6590; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6591; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6592; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
6593; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
6594; AVX2-NEXT:    vmovdqa (%rdi), %ymm5
6595; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
6596; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15]
6597; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6598; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6599; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
6600; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
6601; AVX2-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
6602; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
6603; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6604; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6605; AVX2-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
6606; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6607; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
6608; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6609; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6610; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6611; AVX2-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6612; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
6613; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
6614; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
6615; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
6616; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
6617; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
6618; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
6619; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
6620; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6621; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
6622; AVX2-NEXT:    # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
6623; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
6624; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
6625; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
6626; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm8
6627; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7]
6628; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
6629; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
6630; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm11
6631; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15]
6632; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
6633; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
6634; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15]
6635; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm10
6636; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
6637; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
6638; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
6639; AVX2-NEXT:    vpblendvb %ymm3, %ymm8, %ymm1, %ymm6
6640; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15]
6641; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
6642; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
6643; AVX2-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
6644; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
6645; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm10
6646; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
6647; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
6648; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm0, %ymm2
6649; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm9
6650; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm3
6651; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3]
6652; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
6653; AVX2-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
6654; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6655; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6656; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15]
6657; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
6658; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6659; AVX2-NEXT:    vmovdqa 624(%rdi), %xmm15
6660; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm12
6661; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0],xmm15[1],xmm12[2,3]
6662; AVX2-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6663; AVX2-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
6664; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
6665; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6666; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15]
6667; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
6668; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6669; AVX2-NEXT:    vmovdqa 464(%rdi), %xmm10
6670; AVX2-NEXT:    vmovdqa 448(%rdi), %xmm8
6671; AVX2-NEXT:    vpblendd {{.*#+}} xmm13 = xmm8[0],xmm10[1],xmm8[2,3]
6672; AVX2-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6673; AVX2-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
6674; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6675; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6676; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15]
6677; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
6678; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6679; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm5
6680; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm4
6681; AVX2-NEXT:    vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3]
6682; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6683; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6684; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
6685; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6686; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6687; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
6688; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
6689; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6690; AVX2-NEXT:    vmovdqa %xmm3, %xmm14
6691; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6692; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3]
6693; AVX2-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6694; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
6695; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
6696; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6697; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6698; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
6699; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6700; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6701; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3]
6702; AVX2-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6703; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
6704; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6705; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
6706; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
6707; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6708; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
6709; AVX2-NEXT:    vmovdqa %xmm8, %xmm10
6710; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6711; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
6712; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6713; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15]
6714; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6715; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6716; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3]
6717; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
6718; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6719; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
6720; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6721; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6722; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6723; AVX2-NEXT:    vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
6724; AVX2-NEXT:    # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15]
6725; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6726; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
6727; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6728; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
6729; AVX2-NEXT:    # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
6730; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
6731; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
6732; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
6733; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm3
6734; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
6735; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
6736; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
6737; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3]
6738; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
6739; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
6740; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6741; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
6742; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
6743; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6744; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6745; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6746; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
6747; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6748; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
6749; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6750; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
6751; AVX2-NEXT:    # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
6752; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
6753; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
6754; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
6755; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
6756; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
6757; AVX2-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload
6758; AVX2-NEXT:    # xmm6 = mem[0],xmm12[1],mem[2,3]
6759; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
6760; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6761; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
6762; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
6763; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6764; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
6765; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6766; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
6767; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6768; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
6769; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6770; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6771; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
6772; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
6773; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
6774; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
6775; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
6776; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
6777; AVX2-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload
6778; AVX2-NEXT:    # xmm6 = mem[0],xmm10[1],mem[2,3]
6779; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
6780; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6781; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
6782; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
6783; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6784; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6785; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
6786; AVX2-NEXT:    # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15]
6787; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6788; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
6789; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
6790; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6791; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6792; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
6793; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
6794; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7]
6795; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
6796; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
6797; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6798; AVX2-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6799; AVX2-NEXT:    # xmm2 = mem[0],xmm2[1],mem[2,3]
6800; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
6801; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6802; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6803; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6804; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6805; AVX2-NEXT:    vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
6806; AVX2-NEXT:    # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15]
6807; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6808; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6809; AVX2-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
6810; AVX2-NEXT:    # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15]
6811; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
6812; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
6813; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
6814; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm3
6815; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
6816; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
6817; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
6818; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6819; AVX2-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload
6820; AVX2-NEXT:    # xmm6 = mem[0,1],xmm15[2],mem[3]
6821; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
6822; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
6823; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6824; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
6825; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
6826; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6827; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6828; AVX2-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
6829; AVX2-NEXT:    # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15]
6830; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6831; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
6832; AVX2-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
6833; AVX2-NEXT:    # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15]
6834; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
6835; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
6836; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
6837; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
6838; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
6839; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6840; AVX2-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload
6841; AVX2-NEXT:    # xmm6 = mem[0,1],xmm11[2],mem[3]
6842; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
6843; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6844; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
6845; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
6846; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6847; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
6848; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6849; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
6850; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
6851; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
6852; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
6853; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
6854; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
6855; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
6856; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6857; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6858; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3]
6859; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
6860; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6861; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
6862; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
6863; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6864; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6865; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15]
6866; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6867; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
6868; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
6869; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
6870; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
6871; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3]
6872; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
6873; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
6874; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6875; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6876; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3]
6877; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
6878; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6879; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6880; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6881; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6882; AVX2-NEXT:    vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
6883; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15]
6884; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6885; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7]
6886; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6887; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6888; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
6889; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
6890; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
6891; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
6892; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
6893; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
6894; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
6895; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
6896; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
6897; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6898; AVX2-NEXT:    # xmm2 = mem[3,1,2,3]
6899; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
6900; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
6901; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
6902; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
6903; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6904; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6905; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
6906; AVX2-NEXT:    # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
6907; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
6908; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
6909; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6910; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6911; AVX2-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
6912; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
6913; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
6914; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
6915; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
6916; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
6917; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
6918; AVX2-NEXT:    # xmm4 = mem[3,1,2,3]
6919; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
6920; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3]
6921; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
6922; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6923; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
6924; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
6925; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6926; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6927; AVX2-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
6928; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6929; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
6930; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6931; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
6932; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
6933; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
6934; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
6935; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
6936; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
6937; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
6938; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
6939; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
6940; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3]
6941; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
6942; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
6943; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
6944; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
6945; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
6946; AVX2-NEXT:    # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15]
6947; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
6948; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7]
6949; AVX2-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
6950; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6951; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
6952; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15]
6953; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
6954; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
6955; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
6956; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
6957; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3]
6958; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
6959; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
6960; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
6961; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
6962; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
6963; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
6964; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6965; AVX2-NEXT:    vmovaps %ymm5, 64(%rsi)
6966; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6967; AVX2-NEXT:    vmovaps %ymm5, (%rsi)
6968; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6969; AVX2-NEXT:    vmovaps %ymm5, 96(%rsi)
6970; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6971; AVX2-NEXT:    vmovaps %ymm5, 32(%rsi)
6972; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6973; AVX2-NEXT:    vmovaps %ymm5, 64(%rdx)
6974; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6975; AVX2-NEXT:    vmovaps %ymm5, (%rdx)
6976; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6977; AVX2-NEXT:    vmovaps %ymm5, 96(%rdx)
6978; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6979; AVX2-NEXT:    vmovaps %ymm5, 32(%rdx)
6980; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6981; AVX2-NEXT:    vmovaps %ymm5, 64(%rcx)
6982; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6983; AVX2-NEXT:    vmovaps %ymm5, (%rcx)
6984; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6985; AVX2-NEXT:    vmovaps %ymm5, 96(%rcx)
6986; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6987; AVX2-NEXT:    vmovaps %ymm5, 32(%rcx)
6988; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6989; AVX2-NEXT:    vmovaps %ymm5, 64(%r8)
6990; AVX2-NEXT:    vmovdqa %ymm1, (%r8)
6991; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6992; AVX2-NEXT:    vmovaps %ymm1, 96(%r8)
6993; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6994; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
6995; AVX2-NEXT:    vmovdqa %ymm4, 64(%r9)
6996; AVX2-NEXT:    vmovdqa %ymm3, (%r9)
6997; AVX2-NEXT:    vmovdqa %ymm2, 96(%r9)
6998; AVX2-NEXT:    vmovdqa %ymm0, 32(%r9)
6999; AVX2-NEXT:    addq $1048, %rsp # imm = 0x418
7000; AVX2-NEXT:    vzeroupper
7001; AVX2-NEXT:    retq
7002;
7003; AVX2-FP-LABEL: load_i16_stride5_vf64:
7004; AVX2-FP:       # %bb.0:
7005; AVX2-FP-NEXT:    subq $1080, %rsp # imm = 0x438
7006; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm13
7007; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm5
7008; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7009; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm6
7010; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7011; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm7
7012; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7013; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm8
7014; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
7015; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7016; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm15
7017; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm1
7018; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7019; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm0
7020; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7021; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15]
7022; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7023; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
7024; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
7025; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm3
7026; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15]
7027; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7028; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
7029; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7]
7030; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
7031; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm4
7032; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
7033; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
7034; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
7035; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
7036; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7037; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
7038; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
7039; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
7040; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm6
7041; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
7042; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm14
7043; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7044; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
7045; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm3
7046; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm7
7047; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7048; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15]
7049; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7050; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7051; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
7052; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
7053; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm12
7054; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm11
7055; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
7056; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7057; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7058; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm6
7059; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
7060; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7061; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
7062; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
7063; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7064; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm9
7065; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm10
7066; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
7067; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7068; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7069; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
7070; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7]
7071; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
7072; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
7073; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
7074; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
7075; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7076; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7077; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm6
7078; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7]
7079; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
7080; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm2
7081; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
7082; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7083; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7084; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7085; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15]
7086; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7087; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
7088; AVX2-FP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
7089; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
7090; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm3
7091; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
7092; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
7093; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
7094; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
7095; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
7096; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
7097; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7098; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
7099; AVX2-FP-NEXT:    # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
7100; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7101; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
7102; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7103; AVX2-FP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
7104; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
7105; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm8
7106; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7]
7107; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
7108; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
7109; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm3
7110; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
7111; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
7112; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
7113; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
7114; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm11
7115; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
7116; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
7117; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
7118; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm1
7119; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm8, %ymm0, %ymm2
7120; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
7121; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1]
7122; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
7123; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm8, %ymm7
7124; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
7125; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm11
7126; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
7127; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
7128; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm6, %ymm7, %ymm12
7129; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm1
7130; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm9
7131; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0],xmm1[1],xmm9[2,3]
7132; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
7133; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
7134; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7135; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
7136; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
7137; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
7138; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7139; AVX2-FP-NEXT:    vmovdqa 624(%rdi), %xmm10
7140; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm8
7141; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm10[1],xmm8[2,3]
7142; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
7143; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7144; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7145; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
7146; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
7147; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7148; AVX2-FP-NEXT:    vmovdqa 464(%rdi), %xmm6
7149; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %xmm5
7150; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm5[0],xmm6[1],xmm5[2,3]
7151; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7152; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
7153; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7154; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7155; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
7156; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
7157; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7158; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm11
7159; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm7
7160; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm7[0],xmm11[1],xmm7[2,3]
7161; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7162; AVX2-FP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7163; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
7164; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7165; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7166; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
7167; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
7168; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7169; AVX2-FP-NEXT:    vmovdqa %xmm9, %xmm4
7170; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7171; AVX2-FP-NEXT:    vmovdqa %xmm1, %xmm14
7172; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7173; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3]
7174; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
7175; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
7176; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7177; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7178; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
7179; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7180; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7181; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
7182; AVX2-FP-NEXT:    vmovdqa %xmm10, %xmm13
7183; AVX2-FP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7184; AVX2-FP-NEXT:    vmovdqa %xmm8, %xmm10
7185; AVX2-FP-NEXT:    vmovdqa %xmm8, (%rsp) # 16-byte Spill
7186; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
7187; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7188; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
7189; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
7190; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7191; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3]
7192; AVX2-FP-NEXT:    vmovdqa %xmm6, %xmm3
7193; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7194; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
7195; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7196; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
7197; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7198; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7199; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3]
7200; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
7201; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7202; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
7203; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
7204; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7205; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7206; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
7207; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
7208; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7209; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
7210; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
7211; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
7212; AVX2-FP-NEXT:    # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15]
7213; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
7214; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
7215; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
7216; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm0, %ymm2
7217; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
7218; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
7219; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
7220; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3]
7221; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
7222; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
7223; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7224; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
7225; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
7226; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7227; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7228; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7229; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
7230; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7231; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
7232; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7233; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15]
7234; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
7235; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
7236; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
7237; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
7238; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
7239; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3]
7240; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
7241; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7242; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
7243; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
7244; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7245; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7246; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
7247; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
7248; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7249; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
7250; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7251; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7252; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
7253; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
7254; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
7255; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
7256; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
7257; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
7258; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
7259; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7260; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3]
7261; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
7262; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7263; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
7264; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
7265; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7266; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7267; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7268; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15]
7269; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7270; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
7271; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
7272; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7273; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
7274; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
7275; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm9
7276; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7]
7277; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
7278; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
7279; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7280; AVX2-FP-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7281; AVX2-FP-NEXT:    # xmm2 = mem[0],xmm2[1],mem[2,3]
7282; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
7283; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7284; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
7285; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7286; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7287; AVX2-FP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
7288; AVX2-FP-NEXT:    # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15]
7289; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7290; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
7291; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7292; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15]
7293; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
7294; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
7295; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
7296; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
7297; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
7298; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
7299; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
7300; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7301; AVX2-FP-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
7302; AVX2-FP-NEXT:    # xmm9 = xmm1[0,1],mem[2],xmm1[3]
7303; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
7304; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
7305; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7306; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
7307; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
7308; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7309; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
7310; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7311; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
7312; AVX2-FP-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
7313; AVX2-FP-NEXT:    # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15]
7314; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
7315; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
7316; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
7317; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
7318; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
7319; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7320; AVX2-FP-NEXT:    vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload
7321; AVX2-FP-NEXT:    # xmm9 = xmm4[0,1],mem[2],xmm4[3]
7322; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
7323; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7324; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
7325; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
7326; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7327; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
7328; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7329; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
7330; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15]
7331; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
7332; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
7333; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
7334; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
7335; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
7336; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7337; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3]
7338; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
7339; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7340; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
7341; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
7342; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7343; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
7344; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7345; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15]
7346; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7347; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
7348; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
7349; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7350; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7351; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
7352; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm9
7353; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3]
7354; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
7355; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
7356; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7357; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7358; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3]
7359; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
7360; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7361; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
7362; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7363; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7364; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7365; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7366; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15]
7367; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7368; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
7369; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
7370; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
7371; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
7372; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
7373; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
7374; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
7375; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
7376; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
7377; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
7378; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7379; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
7380; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7381; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm4
7382; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
7383; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7384; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
7385; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
7386; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
7387; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7]
7388; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7389; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7390; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
7391; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
7392; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
7393; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7394; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
7395; AVX2-FP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
7396; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm14
7397; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7]
7398; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
7399; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
7400; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
7401; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7402; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
7403; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm8 # 16-byte Reload
7404; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
7405; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
7406; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
7407; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
7408; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm3, %xmm4
7409; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
7410; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
7411; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15]
7412; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
7413; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
7414; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
7415; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
7416; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
7417; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm5, %ymm5
7418; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
7419; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
7420; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
7421; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
7422; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7423; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
7424; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
7425; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
7426; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
7427; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
7428; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7429; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
7430; AVX2-FP-NEXT:    # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
7431; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm7
7432; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7]
7433; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm5, %xmm3
7434; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
7435; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm15, %xmm3
7436; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7437; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
7438; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
7439; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7440; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7441; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7442; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rsi)
7443; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7444; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
7445; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7446; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rsi)
7447; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7448; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rsi)
7449; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7450; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rdx)
7451; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7452; AVX2-FP-NEXT:    vmovaps %ymm1, (%rdx)
7453; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7454; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rdx)
7455; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7456; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rdx)
7457; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7458; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rcx)
7459; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7460; AVX2-FP-NEXT:    vmovaps %ymm1, (%rcx)
7461; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7462; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rcx)
7463; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7464; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rcx)
7465; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7466; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%r8)
7467; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7468; AVX2-FP-NEXT:    vmovaps %ymm1, (%r8)
7469; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7470; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%r8)
7471; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7472; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%r8)
7473; AVX2-FP-NEXT:    vmovdqa %ymm0, 64(%r9)
7474; AVX2-FP-NEXT:    vmovdqa %ymm4, (%r9)
7475; AVX2-FP-NEXT:    vmovdqa %ymm2, 96(%r9)
7476; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%r9)
7477; AVX2-FP-NEXT:    addq $1080, %rsp # imm = 0x438
7478; AVX2-FP-NEXT:    vzeroupper
7479; AVX2-FP-NEXT:    retq
7480;
7481; AVX2-FCP-LABEL: load_i16_stride5_vf64:
7482; AVX2-FCP:       # %bb.0:
7483; AVX2-FCP-NEXT:    subq $1000, %rsp # imm = 0x3E8
7484; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm9
7485; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7486; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm14
7487; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm11
7488; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm8
7489; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7490; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm10
7491; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7492; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm4
7493; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7494; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm3
7495; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7496; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm0
7497; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7498; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
7499; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7500; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
7501; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
7502; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
7503; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
7504; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
7505; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15]
7506; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3]
7507; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm4
7508; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
7509; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
7510; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
7511; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
7512; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7513; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
7514; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
7515; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
7516; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
7517; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4],ymm14[5],ymm11[6,7],ymm14[8],ymm11[9,10],ymm14[11],ymm11[12],ymm14[13],ymm11[14,15]
7518; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7519; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7520; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
7521; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
7522; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
7523; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7524; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm15
7525; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5],ymm15[6],ymm9[7,8],ymm15[9],ymm9[10,11],ymm15[12],ymm9[13],ymm15[14],ymm9[15]
7526; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7527; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
7528; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
7529; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
7530; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
7531; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15]
7532; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7533; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7534; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
7535; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
7536; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
7537; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
7538; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
7539; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm9
7540; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
7541; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
7542; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7543; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7544; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
7545; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
7546; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
7547; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm5
7548; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm10
7549; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
7550; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7551; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7552; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
7553; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
7554; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
7555; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7556; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7557; AVX2-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7558; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15]
7559; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
7560; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
7561; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
7562; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7563; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7564; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7565; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
7566; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [2,0,0,0,4,7,1,6]
7567; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm3
7568; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
7569; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7570; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm6
7571; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7572; AVX2-FCP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7573; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
7574; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
7575; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7]
7576; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7577; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15]
7578; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm3
7579; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7580; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm3
7581; AVX2-FCP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
7582; AVX2-FCP-NEXT:    # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15]
7583; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
7584; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7]
7585; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15]
7586; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm11
7587; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm11
7588; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7589; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm11, %ymm0
7590; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
7591; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
7592; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7]
7593; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
7594; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15]
7595; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm2
7596; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
7597; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm4, %ymm1, %ymm1
7598; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm12
7599; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7]
7600; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm11
7601; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
7602; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
7603; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7604; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
7605; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
7606; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7607; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm10
7608; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm5, %ymm11
7609; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
7610; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7611; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
7612; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
7613; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7614; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
7615; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm11
7616; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
7617; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
7618; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
7619; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
7620; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
7621; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm4
7622; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm5
7623; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7624; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
7625; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7626; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
7627; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
7628; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7629; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7]
7630; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm7, %ymm11
7631; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm8
7632; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7633; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
7634; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
7635; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15]
7636; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
7637; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7638; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm6
7639; AVX2-FCP-NEXT:    vmovdqa %ymm10, %ymm12
7640; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7641; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
7642; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
7643; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7]
7644; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7645; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm7, %ymm3
7646; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm5
7647; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7648; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
7649; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
7650; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
7651; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7652; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm0
7653; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
7654; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
7655; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7656; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7657; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7658; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7659; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15]
7660; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
7661; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
7662; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
7663; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm6
7664; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7665; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
7666; AVX2-FCP-NEXT:    # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
7667; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,0,0,5,7,2,4]
7668; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm7
7669; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
7670; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
7671; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7]
7672; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0]
7673; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
7674; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm6, %ymm11
7675; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm8
7676; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
7677; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
7678; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
7679; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7680; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
7681; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7682; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15]
7683; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
7684; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
7685; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
7686; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7687; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
7688; AVX2-FCP-NEXT:    # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
7689; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm11
7690; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm11
7691; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
7692; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm6, %ymm11
7693; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm11
7694; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
7695; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
7696; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
7697; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7698; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
7699; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
7700; AVX2-FCP-NEXT:    # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15]
7701; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
7702; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
7703; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7704; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7705; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15]
7706; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm11
7707; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm11
7708; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
7709; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
7710; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm11
7711; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
7712; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
7713; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7]
7714; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7715; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7716; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
7717; AVX2-FCP-NEXT:    # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15]
7718; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
7719; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
7720; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
7721; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
7722; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
7723; AVX2-FCP-NEXT:    # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
7724; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm1
7725; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
7726; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
7727; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
7728; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
7729; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
7730; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7731; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7732; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15]
7733; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
7734; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
7735; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
7736; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7737; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7738; AVX2-FCP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
7739; AVX2-FCP-NEXT:    # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15]
7740; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,5,0,2,7]
7741; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm9, %ymm6
7742; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
7743; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
7744; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
7745; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0]
7746; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
7747; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
7748; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
7749; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
7750; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
7751; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
7752; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7753; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
7754; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
7755; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
7756; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7757; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7758; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
7759; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm7
7760; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
7761; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
7762; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
7763; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
7764; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
7765; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
7766; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7767; AVX2-FCP-NEXT:    vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
7768; AVX2-FCP-NEXT:    # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15]
7769; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
7770; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
7771; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
7772; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm12
7773; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm7
7774; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
7775; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7776; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
7777; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7778; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm6, %ymm7
7779; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
7780; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm2
7781; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
7782; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
7783; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7784; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7785; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15]
7786; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
7787; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
7788; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
7789; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7790; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
7791; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
7792; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
7793; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
7794; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
7795; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm6, %ymm1
7796; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
7797; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
7798; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7799; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7800; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7801; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
7802; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
7803; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
7804; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
7805; AVX2-FCP-NEXT:    # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
7806; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
7807; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
7808; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,0,0,6,0,3,5]
7809; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
7810; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
7811; AVX2-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
7812; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
7813; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
7814; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7]
7815; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
7816; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
7817; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
7818; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
7819; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7820; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
7821; AVX2-FCP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15]
7822; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm11
7823; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7]
7824; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
7825; AVX2-FCP-NEXT:    # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
7826; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
7827; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm5, %ymm11
7828; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm11
7829; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7]
7830; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
7831; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm9
7832; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
7833; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
7834; AVX2-FCP-NEXT:    # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15]
7835; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7836; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
7837; AVX2-FCP-NEXT:    # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
7838; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
7839; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7]
7840; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm9
7841; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
7842; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm11
7843; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7]
7844; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm8
7845; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm8
7846; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
7847; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
7848; AVX2-FCP-NEXT:    # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
7849; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
7850; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7]
7851; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
7852; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload
7853; AVX2-FCP-NEXT:    # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
7854; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm5
7855; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
7856; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7]
7857; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm0
7858; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
7859; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
7860; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7861; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rsi)
7862; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
7863; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
7864; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7865; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rsi)
7866; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7867; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
7868; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7869; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rdx)
7870; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7871; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
7872; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7873; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rdx)
7874; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7875; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
7876; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7877; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rcx)
7878; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7879; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
7880; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7881; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rcx)
7882; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7883; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
7884; AVX2-FCP-NEXT:    vmovdqa %ymm1, 64(%r8)
7885; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7886; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r8)
7887; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7888; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%r8)
7889; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7890; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r8)
7891; AVX2-FCP-NEXT:    vmovdqa %ymm0, 64(%r9)
7892; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%r9)
7893; AVX2-FCP-NEXT:    vmovdqa %ymm3, 96(%r9)
7894; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%r9)
7895; AVX2-FCP-NEXT:    addq $1000, %rsp # imm = 0x3E8
7896; AVX2-FCP-NEXT:    vzeroupper
7897; AVX2-FCP-NEXT:    retq
7898;
7899; AVX512-LABEL: load_i16_stride5_vf64:
7900; AVX512:       # %bb.0:
7901; AVX512-NEXT:    subq $552, %rsp # imm = 0x228
7902; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm6
7903; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm11
7904; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
7905; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7906; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7907; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7908; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
7909; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
7910; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
7911; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm8
7912; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm7
7913; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
7914; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7915; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7916; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
7917; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
7918; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
7919; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7920; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm19
7921; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm15
7922; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm13
7923; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm12
7924; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm14
7925; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
7926; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
7927; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm10
7928; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm9
7929; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
7930; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
7931; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
7932; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
7933; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
7934; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
7935; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
7936; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
7937; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
7938; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7939; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
7940; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7941; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
7942; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
7943; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
7944; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
7945; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
7946; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
7947; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
7948; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7949; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
7950; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7951; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
7952; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7953; AVX512-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
7954; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
7955; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
7956; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
7957; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
7958; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7959; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7960; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
7961; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
7962; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
7963; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm28
7964; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
7965; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15]
7966; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm18
7967; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm24
7968; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
7969; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
7970; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
7971; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3]
7972; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm16
7973; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm30
7974; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
7975; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
7976; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7977; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm11
7978; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm7
7979; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3]
7980; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
7981; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7982; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7983; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
7984; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
7985; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
7986; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
7987; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
7988; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
7989; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
7990; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
7991; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7992; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm12
7993; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm15
7994; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
7995; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
7996; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
7997; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
7998; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
7999; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8000; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
8001; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
8002; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
8003; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8004; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8005; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm8
8006; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
8007; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
8008; AVX512-NEXT:    vmovdqa 448(%rdi), %xmm3
8009; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
8010; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8011; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8012; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8013; AVX512-NEXT:    vmovdqa %xmm11, %xmm6
8014; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
8015; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
8016; AVX512-NEXT:    vmovdqa %xmm7, %xmm9
8017; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
8018; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8019; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8020; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8021; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3]
8022; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm22
8023; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
8024; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8025; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm23
8026; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm10
8027; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm1
8028; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm2
8029; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
8030; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
8031; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm17
8032; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
8033; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
8034; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm5
8035; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm13
8036; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15]
8037; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
8038; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
8039; AVX512-NEXT:    vmovdqa64 496(%rdi), %xmm21
8040; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3]
8041; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
8042; AVX512-NEXT:    vmovdqa 480(%rdi), %xmm7
8043; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3]
8044; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
8045; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
8046; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
8047; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8048; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
8049; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
8050; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
8051; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8052; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
8053; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8054; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19))
8055; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm0
8056; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8057; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
8058; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm10
8059; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
8060; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
8061; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
8062; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
8063; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
8064; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm10
8065; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7]
8066; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8067; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3]
8068; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
8069; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm24
8070; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3]
8071; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
8072; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
8073; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
8074; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8075; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
8076; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3]
8077; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
8078; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm4
8079; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
8080; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8081; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
8082; AVX512-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
8083; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
8084; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8085; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm4
8086; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3]
8087; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
8088; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
8089; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8090; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
8091; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm26
8092; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm22
8093; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
8094; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7]
8095; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3]
8096; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm30
8097; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
8098; AVX512-NEXT:    vpsrlq $48, %xmm21, %xmm13
8099; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
8100; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
8101; AVX512-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
8102; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
8103; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
8104; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm9
8105; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm7
8106; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
8107; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm14
8108; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
8109; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
8110; AVX512-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
8111; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8112; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
8113; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
8114; AVX512-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
8115; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
8116; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8117; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
8118; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm17
8119; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm10
8120; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7]
8121; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
8122; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3]
8123; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm20
8124; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7]
8125; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm1
8126; AVX512-NEXT:    vpsrlq $48, %xmm16, %xmm11
8127; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
8128; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
8129; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
8130; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
8131; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm19
8132; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
8133; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
8134; AVX512-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
8135; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
8136; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
8137; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm5
8138; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3]
8139; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm23
8140; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
8141; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8142; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28))
8143; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8144; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8145; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
8146; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
8147; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm0
8148; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15]
8149; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
8150; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
8151; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
8152; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
8153; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm0
8154; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm5
8155; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15]
8156; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm10
8157; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
8158; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
8159; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
8160; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
8161; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
8162; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3]
8163; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm29
8164; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm28
8165; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
8166; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
8167; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2))
8168; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm8
8169; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm4
8170; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15]
8171; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm14
8172; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
8173; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
8174; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8175; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm6
8176; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3]
8177; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
8178; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm5
8179; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3]
8180; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7]
8181; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
8182; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
8183; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
8184; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm10
8185; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
8186; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
8187; AVX512-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
8188; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
8189; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
8190; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
8191; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8192; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm2
8193; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
8194; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
8195; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm26
8196; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm10
8197; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7]
8198; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8199; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3]
8200; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
8201; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
8202; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm17
8203; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
8204; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
8205; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
8206; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
8207; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm31
8208; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm13
8209; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7]
8210; AVX512-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
8211; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
8212; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
8213; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8214; AVX512-NEXT:    vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
8215; AVX512-NEXT:    # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
8216; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
8217; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7]
8218; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
8219; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8220; AVX512-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
8221; AVX512-NEXT:    # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
8222; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
8223; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
8224; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
8225; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
8226; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
8227; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm3
8228; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3]
8229; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
8230; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
8231; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0))
8232; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm23
8233; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm13
8234; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
8235; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
8236; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7]
8237; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm14
8238; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm15
8239; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
8240; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
8241; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
8242; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
8243; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
8244; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
8245; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
8246; AVX512-NEXT:    vmovdqa %ymm4, %ymm3
8247; AVX512-NEXT:    vmovdqa %ymm8, %ymm4
8248; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15]
8249; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
8250; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7]
8251; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
8252; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3]
8253; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm7
8254; AVX512-NEXT:    vmovdqa %xmm5, %xmm6
8255; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
8256; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
8257; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
8258; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm1
8259; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm2
8260; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3]
8261; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
8262; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8263; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
8264; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0))
8265; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
8266; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm10
8267; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
8268; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm9
8269; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7]
8270; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
8271; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8272; AVX512-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
8273; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
8274; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
8275; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm25
8276; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
8277; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
8278; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7]
8279; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
8280; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
8281; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
8282; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
8283; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
8284; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
8285; AVX512-NEXT:    vpshufb %xmm8, %xmm11, %xmm11
8286; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
8287; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
8288; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
8289; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
8290; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3]
8291; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
8292; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
8293; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
8294; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
8295; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
8296; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
8297; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm1
8298; AVX512-NEXT:    movb $7, %al
8299; AVX512-NEXT:    kmovw %eax, %k1
8300; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
8301; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
8302; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15]
8303; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm5
8304; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7]
8305; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
8306; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
8307; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
8308; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
8309; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
8310; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
8311; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
8312; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm4
8313; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
8314; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
8315; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
8316; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
8317; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm2
8318; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm4
8319; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
8320; AVX512-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
8321; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
8322; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
8323; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
8324; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
8325; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8326; AVX512-NEXT:    vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
8327; AVX512-NEXT:    # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
8328; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
8329; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
8330; AVX512-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
8331; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8332; AVX512-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
8333; AVX512-NEXT:    # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
8334; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
8335; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
8336; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
8337; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
8338; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
8339; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
8340; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm4
8341; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
8342; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
8343; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
8344; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
8345; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
8346; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8347; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
8348; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
8349; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
8350; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8351; AVX512-NEXT:    vmovaps %zmm2, (%rsi)
8352; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8353; AVX512-NEXT:    vmovaps %zmm2, 64(%rsi)
8354; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8355; AVX512-NEXT:    vmovaps %zmm2, 64(%rdx)
8356; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8357; AVX512-NEXT:    vmovaps %zmm2, (%rdx)
8358; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8359; AVX512-NEXT:    vmovaps %zmm2, 64(%rcx)
8360; AVX512-NEXT:    vmovdqa64 %zmm23, (%rcx)
8361; AVX512-NEXT:    vmovdqa64 %zmm25, 64(%r8)
8362; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8363; AVX512-NEXT:    vmovaps %zmm2, (%r8)
8364; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%r9)
8365; AVX512-NEXT:    vmovdqa64 %zmm0, (%r9)
8366; AVX512-NEXT:    addq $552, %rsp # imm = 0x228
8367; AVX512-NEXT:    vzeroupper
8368; AVX512-NEXT:    retq
8369;
8370; AVX512-FCP-LABEL: load_i16_stride5_vf64:
8371; AVX512-FCP:       # %bb.0:
8372; AVX512-FCP-NEXT:    subq $552, %rsp # imm = 0x228
8373; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
8374; AVX512-FCP-NEXT:    vmovdqa 496(%rdi), %xmm1
8375; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8376; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
8377; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
8378; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %xmm2
8379; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8380; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
8381; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
8382; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
8383; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8384; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %ymm11
8385; AVX512-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8386; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm7
8387; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8388; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm8
8389; AVX512-FCP-NEXT:    vmovdqu %ymm8, (%rsp) # 32-byte Spill
8390; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm4
8391; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm5
8392; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
8393; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm28
8394; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
8395; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
8396; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
8397; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
8398; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
8399; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
8400; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm9
8401; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
8402; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm26
8403; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8404; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm27
8405; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
8406; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
8407; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
8408; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm5
8409; AVX512-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
8410; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
8411; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
8412; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
8413; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
8414; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0]
8415; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
8416; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
8417; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm7
8418; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
8419; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm17, %ymm7
8420; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
8421; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
8422; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
8423; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
8424; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
8425; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
8426; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8427; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm18, %ymm7
8428; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
8429; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
8430; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8431; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4))
8432; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm4
8433; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8434; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm4
8435; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
8436; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
8437; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm15
8438; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
8439; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
8440; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
8441; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
8442; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15]
8443; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
8444; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
8445; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
8446; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
8447; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15]
8448; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
8449; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
8450; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
8451; AVX512-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
8452; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
8453; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
8454; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15]
8455; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
8456; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
8457; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
8458; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm9
8459; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15]
8460; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
8461; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
8462; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
8463; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
8464; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm17, %ymm3
8465; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
8466; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
8467; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
8468; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm18, %ymm2
8469; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8470; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1))
8471; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
8472; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8473; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15]
8474; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
8475; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8476; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
8477; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8478; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
8479; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
8480; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
8481; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15]
8482; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm19
8483; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
8484; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm17, %ymm1
8485; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
8486; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
8487; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
8488; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
8489; AVX512-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm10
8490; AVX512-FCP-NEXT:    vpsrlq $48, %xmm31, %xmm0
8491; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
8492; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8493; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0]
8494; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
8495; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm5
8496; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm20, %ymm4
8497; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
8498; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
8499; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
8500; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
8501; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
8502; AVX512-FCP-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
8503; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8504; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm12
8505; AVX512-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8506; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
8507; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm24, %ymm0
8508; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
8509; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
8510; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
8511; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
8512; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm25, %ymm4
8513; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
8514; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
8515; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10))
8516; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm10
8517; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
8518; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8519; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm18
8520; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm28
8521; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
8522; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
8523; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15]
8524; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
8525; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
8526; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm7
8527; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm4
8528; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
8529; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
8530; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
8531; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
8532; AVX512-FCP-NEXT:    vpor %ymm4, %ymm0, %ymm0
8533; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8534; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
8535; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
8536; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15]
8537; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm20, %ymm4
8538; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
8539; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8540; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
8541; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
8542; AVX512-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm4
8543; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
8544; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
8545; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
8546; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
8547; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8548; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
8549; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm24, %ymm3
8550; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
8551; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
8552; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
8553; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm25, %ymm2
8554; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8555; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0))
8556; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
8557; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8558; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
8559; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
8560; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
8561; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
8562; AVX512-FCP-NEXT:    vmovdqa64 %xmm15, %xmm20
8563; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
8564; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0]
8565; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
8566; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
8567; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm26
8568; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm3
8569; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
8570; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
8571; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7]
8572; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
8573; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
8574; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
8575; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
8576; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
8577; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
8578; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
8579; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
8580; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
8581; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
8582; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4]
8583; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
8584; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
8585; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
8586; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm29
8587; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm21
8588; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm25, %ymm3
8589; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
8590; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm0
8591; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
8592; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
8593; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
8594; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
8595; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
8596; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm5, %ymm14
8597; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
8598; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm14
8599; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8600; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0))
8601; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm14, %zmm0
8602; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8603; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
8604; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm10
8605; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
8606; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
8607; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm8
8608; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15]
8609; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm13
8610; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm2
8611; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
8612; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
8613; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm14
8614; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
8615; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm6
8616; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
8617; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
8618; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8619; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
8620; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
8621; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
8622; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
8623; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
8624; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
8625; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
8626; AVX512-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
8627; AVX512-FCP-NEXT:    # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
8628; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
8629; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm25, %ymm3
8630; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
8631; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
8632; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm2
8633; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
8634; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1))
8635; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm28
8636; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm15
8637; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm7
8638; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
8639; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
8640; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
8641; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
8642; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
8643; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm10
8644; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
8645; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm18, %ymm4
8646; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
8647; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
8648; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
8649; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
8650; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
8651; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
8652; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm4, %ymm5
8653; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
8654; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
8655; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm25
8656; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3]
8657; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm22
8658; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm31
8659; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
8660; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15]
8661; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm20
8662; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
8663; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm18, %ymm2
8664; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
8665; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
8666; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8667; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm4, %ymm1
8668; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
8669; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
8670; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
8671; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
8672; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
8673; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
8674; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0]
8675; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm27, %ymm1
8676; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
8677; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
8678; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
8679; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
8680; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
8681; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm26, %ymm3
8682; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
8683; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
8684; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
8685; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
8686; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8687; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
8688; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15]
8689; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
8690; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
8691; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
8692; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7]
8693; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm15
8694; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
8695; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15]
8696; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
8697; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
8698; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
8699; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
8700; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
8701; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
8702; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
8703; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3))
8704; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8705; AVX512-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
8706; AVX512-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
8707; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
8708; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
8709; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
8710; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
8711; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8712; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm25, %ymm9
8713; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
8714; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
8715; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm25, %zmm21
8716; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15]
8717; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
8718; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm25
8719; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
8720; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
8721; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
8722; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8723; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
8724; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
8725; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm4
8726; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
8727; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
8728; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4))
8729; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm4
8730; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm5
8731; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
8732; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
8733; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
8734; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
8735; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
8736; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm27, %ymm4
8737; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
8738; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
8739; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
8740; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm26, %ymm2
8741; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
8742; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
8743; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
8744; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm9 # 32-byte Reload
8745; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8746; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
8747; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
8748; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
8749; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
8750; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm24, %ymm3
8751; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8752; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
8753; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
8754; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm24, %zmm2
8755; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15]
8756; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5]
8757; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
8758; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
8759; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
8760; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
8761; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
8762; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
8763; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
8764; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
8765; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
8766; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
8767; AVX512-FCP-NEXT:    movb $7, %al
8768; AVX512-FCP-NEXT:    kmovw %eax, %k1
8769; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
8770; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
8771; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8772; AVX512-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
8773; AVX512-FCP-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15]
8774; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
8775; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
8776; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
8777; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
8778; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
8779; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
8780; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
8781; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
8782; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
8783; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm6
8784; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
8785; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
8786; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
8787; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
8788; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15]
8789; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm4
8790; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
8791; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
8792; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
8793; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
8794; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
8795; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
8796; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
8797; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
8798; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
8799; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
8800; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
8801; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
8802; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8803; AVX512-FCP-NEXT:    vmovaps %zmm3, (%rsi)
8804; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8805; AVX512-FCP-NEXT:    vmovaps %zmm3, 64(%rsi)
8806; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8807; AVX512-FCP-NEXT:    vmovaps %zmm3, 64(%rdx)
8808; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8809; AVX512-FCP-NEXT:    vmovaps %zmm3, (%rdx)
8810; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
8811; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8812; AVX512-FCP-NEXT:    vmovaps %zmm3, (%rcx)
8813; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%r8)
8814; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
8815; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r9)
8816; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
8817; AVX512-FCP-NEXT:    addq $552, %rsp # imm = 0x228
8818; AVX512-FCP-NEXT:    vzeroupper
8819; AVX512-FCP-NEXT:    retq
8820;
8821; AVX512DQ-LABEL: load_i16_stride5_vf64:
8822; AVX512DQ:       # %bb.0:
8823; AVX512DQ-NEXT:    subq $552, %rsp # imm = 0x228
8824; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm6
8825; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm11
8826; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
8827; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8828; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8829; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
8830; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
8831; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
8832; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
8833; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm8
8834; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm7
8835; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
8836; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8837; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8838; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
8839; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
8840; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
8841; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
8842; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm19
8843; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm15
8844; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm13
8845; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm12
8846; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm14
8847; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
8848; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
8849; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm10
8850; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm9
8851; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
8852; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
8853; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
8854; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
8855; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
8856; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
8857; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
8858; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
8859; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
8860; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8861; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
8862; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
8863; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
8864; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
8865; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
8866; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
8867; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
8868; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
8869; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
8870; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
8871; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
8872; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8873; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
8874; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8875; AVX512DQ-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
8876; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
8877; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
8878; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
8879; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
8880; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8881; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8882; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
8883; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
8884; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
8885; AVX512DQ-NEXT:    vporq %ymm0, %ymm1, %ymm28
8886; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
8887; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15]
8888; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm18
8889; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm24
8890; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
8891; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
8892; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
8893; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3]
8894; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm16
8895; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm30
8896; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
8897; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
8898; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8899; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm11
8900; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm7
8901; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3]
8902; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
8903; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
8904; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8905; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
8906; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
8907; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
8908; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
8909; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
8910; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
8911; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
8912; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
8913; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
8914; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm12
8915; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm15
8916; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
8917; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
8918; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
8919; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
8920; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
8921; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8922; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
8923; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
8924; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
8925; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8926; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8927; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm8
8928; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
8929; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
8930; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %xmm3
8931; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
8932; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8933; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8934; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8935; AVX512DQ-NEXT:    vmovdqa %xmm11, %xmm6
8936; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
8937; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
8938; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm9
8939; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
8940; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8941; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8942; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8943; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3]
8944; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
8945; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
8946; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8947; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm23
8948; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm10
8949; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm1
8950; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm2
8951; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
8952; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
8953; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm17
8954; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
8955; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
8956; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm5
8957; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm13
8958; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15]
8959; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
8960; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
8961; AVX512DQ-NEXT:    vmovdqa64 496(%rdi), %xmm21
8962; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3]
8963; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
8964; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %xmm7
8965; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3]
8966; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
8967; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
8968; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
8969; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8970; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
8971; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
8972; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
8973; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8974; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
8975; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8976; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19))
8977; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm0
8978; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8979; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
8980; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm10
8981; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
8982; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
8983; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm3
8984; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm1
8985; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
8986; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
8987; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7]
8988; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
8989; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3]
8990; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
8991; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm24
8992; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3]
8993; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
8994; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
8995; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
8996; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8997; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
8998; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3]
8999; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
9000; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm4
9001; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
9002; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
9003; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
9004; AVX512DQ-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
9005; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
9006; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9007; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm4
9008; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3]
9009; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
9010; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
9011; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
9012; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
9013; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm26
9014; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm22
9015; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
9016; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7]
9017; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3]
9018; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
9019; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
9020; AVX512DQ-NEXT:    vpsrlq $48, %xmm21, %xmm13
9021; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
9022; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
9023; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
9024; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
9025; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
9026; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm9
9027; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm7
9028; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
9029; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm14
9030; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
9031; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
9032; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
9033; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
9034; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
9035; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
9036; AVX512DQ-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
9037; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
9038; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9039; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
9040; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm17
9041; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
9042; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7]
9043; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
9044; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3]
9045; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm20
9046; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7]
9047; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm1
9048; AVX512DQ-NEXT:    vpsrlq $48, %xmm16, %xmm11
9049; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
9050; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
9051; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
9052; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
9053; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm19
9054; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
9055; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
9056; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
9057; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
9058; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
9059; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm5
9060; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3]
9061; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm23
9062; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
9063; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9064; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28))
9065; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
9066; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9067; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
9068; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
9069; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
9070; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15]
9071; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
9072; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
9073; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
9074; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
9075; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm0
9076; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm5
9077; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15]
9078; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm10
9079; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
9080; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
9081; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
9082; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
9083; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
9084; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3]
9085; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
9086; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm28
9087; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
9088; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
9089; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2))
9090; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm8
9091; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm4
9092; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15]
9093; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm14
9094; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
9095; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
9096; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
9097; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm6
9098; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3]
9099; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
9100; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm5
9101; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3]
9102; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7]
9103; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
9104; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
9105; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
9106; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm10
9107; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
9108; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
9109; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
9110; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
9111; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
9112; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
9113; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9114; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm2
9115; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
9116; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
9117; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm26
9118; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
9119; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7]
9120; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
9121; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3]
9122; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
9123; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
9124; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm17
9125; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
9126; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
9127; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm1
9128; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
9129; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm31
9130; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm13
9131; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7]
9132; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
9133; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
9134; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
9135; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9136; AVX512DQ-NEXT:    vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
9137; AVX512DQ-NEXT:    # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
9138; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
9139; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7]
9140; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
9141; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9142; AVX512DQ-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
9143; AVX512DQ-NEXT:    # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
9144; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
9145; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
9146; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
9147; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
9148; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
9149; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm3
9150; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3]
9151; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
9152; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
9153; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0))
9154; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm23
9155; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm13
9156; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
9157; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
9158; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7]
9159; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm14
9160; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm15
9161; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
9162; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
9163; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
9164; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
9165; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
9166; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
9167; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
9168; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm3
9169; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm4
9170; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15]
9171; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
9172; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7]
9173; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
9174; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3]
9175; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm7
9176; AVX512DQ-NEXT:    vmovdqa %xmm5, %xmm6
9177; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
9178; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
9179; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
9180; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm1
9181; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm2
9182; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3]
9183; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
9184; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
9185; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
9186; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0))
9187; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
9188; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm10
9189; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
9190; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm9
9191; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7]
9192; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
9193; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9194; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
9195; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
9196; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
9197; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm25
9198; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
9199; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
9200; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7]
9201; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
9202; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
9203; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
9204; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
9205; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
9206; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
9207; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm11, %xmm11
9208; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
9209; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
9210; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
9211; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
9212; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3]
9213; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
9214; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
9215; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
9216; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
9217; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
9218; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
9219; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm1
9220; AVX512DQ-NEXT:    movb $7, %al
9221; AVX512DQ-NEXT:    kmovw %eax, %k1
9222; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
9223; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
9224; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15]
9225; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm5
9226; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7]
9227; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
9228; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
9229; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
9230; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
9231; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
9232; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
9233; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
9234; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm4
9235; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
9236; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
9237; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
9238; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
9239; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
9240; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
9241; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
9242; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
9243; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
9244; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
9245; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
9246; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
9247; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9248; AVX512DQ-NEXT:    vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
9249; AVX512DQ-NEXT:    # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
9250; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
9251; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
9252; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
9253; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9254; AVX512DQ-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
9255; AVX512DQ-NEXT:    # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
9256; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
9257; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
9258; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
9259; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
9260; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
9261; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
9262; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm4
9263; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
9264; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
9265; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
9266; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
9267; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
9268; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9269; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
9270; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
9271; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
9272; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9273; AVX512DQ-NEXT:    vmovaps %zmm2, (%rsi)
9274; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9275; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rsi)
9276; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9277; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rdx)
9278; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9279; AVX512DQ-NEXT:    vmovaps %zmm2, (%rdx)
9280; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9281; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rcx)
9282; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%rcx)
9283; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 64(%r8)
9284; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9285; AVX512DQ-NEXT:    vmovaps %zmm2, (%r8)
9286; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%r9)
9287; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r9)
9288; AVX512DQ-NEXT:    addq $552, %rsp # imm = 0x228
9289; AVX512DQ-NEXT:    vzeroupper
9290; AVX512DQ-NEXT:    retq
9291;
9292; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64:
9293; AVX512DQ-FCP:       # %bb.0:
9294; AVX512DQ-FCP-NEXT:    subq $552, %rsp # imm = 0x228
9295; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
9296; AVX512DQ-FCP-NEXT:    vmovdqa 496(%rdi), %xmm1
9297; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9298; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
9299; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
9300; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %xmm2
9301; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9302; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
9303; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9304; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
9305; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9306; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %ymm11
9307; AVX512DQ-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9308; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm7
9309; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9310; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm8
9311; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, (%rsp) # 32-byte Spill
9312; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm4
9313; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm5
9314; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
9315; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm28
9316; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
9317; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
9318; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
9319; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
9320; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
9321; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
9322; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm9
9323; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
9324; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm26
9325; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9326; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm27
9327; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
9328; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
9329; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
9330; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm5
9331; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
9332; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
9333; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
9334; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
9335; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
9336; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0]
9337; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
9338; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
9339; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm7
9340; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
9341; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm17, %ymm7
9342; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
9343; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
9344; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
9345; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
9346; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
9347; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
9348; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9349; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm18, %ymm7
9350; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
9351; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
9352; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
9353; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4))
9354; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm4
9355; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9356; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm4
9357; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
9358; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
9359; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm15
9360; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
9361; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
9362; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
9363; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
9364; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15]
9365; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
9366; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
9367; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
9368; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
9369; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15]
9370; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
9371; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
9372; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
9373; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
9374; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
9375; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
9376; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15]
9377; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
9378; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
9379; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
9380; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm9
9381; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15]
9382; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
9383; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
9384; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
9385; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
9386; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm17, %ymm3
9387; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
9388; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
9389; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
9390; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm18, %ymm2
9391; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
9392; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1))
9393; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
9394; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9395; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15]
9396; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
9397; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9398; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
9399; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9400; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
9401; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
9402; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
9403; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15]
9404; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm19
9405; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
9406; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm17, %ymm1
9407; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
9408; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
9409; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
9410; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
9411; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm10
9412; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm31, %xmm0
9413; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
9414; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9415; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0]
9416; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
9417; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm5
9418; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm20, %ymm4
9419; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
9420; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
9421; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
9422; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
9423; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
9424; AVX512DQ-FCP-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
9425; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9426; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm12
9427; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9428; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
9429; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm24, %ymm0
9430; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
9431; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
9432; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
9433; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
9434; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm25, %ymm4
9435; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
9436; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
9437; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10))
9438; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm10
9439; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
9440; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9441; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm18
9442; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm28
9443; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
9444; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
9445; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15]
9446; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
9447; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
9448; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm7
9449; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm4
9450; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
9451; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
9452; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
9453; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
9454; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm0, %ymm0
9455; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9456; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
9457; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
9458; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15]
9459; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm20, %ymm4
9460; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
9461; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9462; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
9463; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
9464; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm4
9465; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
9466; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
9467; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9468; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
9469; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9470; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
9471; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm24, %ymm3
9472; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
9473; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
9474; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
9475; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm25, %ymm2
9476; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
9477; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0))
9478; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
9479; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9480; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
9481; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
9482; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
9483; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
9484; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm15, %xmm20
9485; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
9486; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0]
9487; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
9488; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
9489; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm26
9490; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm3
9491; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
9492; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
9493; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7]
9494; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9495; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
9496; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
9497; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9498; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
9499; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
9500; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
9501; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
9502; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
9503; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
9504; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4]
9505; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
9506; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
9507; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
9508; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm29
9509; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm21
9510; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm25, %ymm3
9511; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
9512; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm0
9513; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
9514; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
9515; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
9516; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
9517; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
9518; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm5, %ymm14
9519; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
9520; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm14
9521; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
9522; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0))
9523; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm14, %zmm0
9524; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9525; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
9526; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm10
9527; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
9528; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9529; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm8
9530; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15]
9531; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm13
9532; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm2
9533; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
9534; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
9535; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm14
9536; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
9537; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm6
9538; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
9539; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
9540; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9541; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
9542; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
9543; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
9544; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
9545; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
9546; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
9547; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
9548; AVX512DQ-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
9549; AVX512DQ-FCP-NEXT:    # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
9550; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
9551; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm25, %ymm3
9552; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
9553; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
9554; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm2
9555; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
9556; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1))
9557; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm28
9558; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm15
9559; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm7
9560; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
9561; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
9562; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
9563; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
9564; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
9565; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm10
9566; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
9567; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm18, %ymm4
9568; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
9569; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
9570; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
9571; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
9572; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
9573; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
9574; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm4, %ymm5
9575; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
9576; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
9577; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm25
9578; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3]
9579; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm22
9580; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm31
9581; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
9582; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15]
9583; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm20
9584; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
9585; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm18, %ymm2
9586; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
9587; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
9588; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9589; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm4, %ymm1
9590; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
9591; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
9592; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
9593; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
9594; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
9595; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
9596; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0]
9597; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm27, %ymm1
9598; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
9599; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
9600; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
9601; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
9602; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
9603; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm26, %ymm3
9604; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
9605; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
9606; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
9607; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
9608; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9609; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
9610; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15]
9611; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
9612; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
9613; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
9614; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7]
9615; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm15
9616; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
9617; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15]
9618; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
9619; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
9620; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
9621; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
9622; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
9623; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
9624; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
9625; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3))
9626; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9627; AVX512DQ-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
9628; AVX512DQ-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
9629; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
9630; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
9631; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
9632; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
9633; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
9634; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm25, %ymm9
9635; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
9636; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
9637; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm25, %zmm21
9638; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15]
9639; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
9640; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm25
9641; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
9642; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
9643; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
9644; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9645; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
9646; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
9647; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm4
9648; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
9649; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
9650; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4))
9651; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm4
9652; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm5
9653; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
9654; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
9655; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
9656; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
9657; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
9658; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm27, %ymm4
9659; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
9660; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
9661; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
9662; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm26, %ymm2
9663; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
9664; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
9665; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
9666; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm9 # 32-byte Reload
9667; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9668; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
9669; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
9670; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
9671; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
9672; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm24, %ymm3
9673; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
9674; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
9675; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
9676; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm24, %zmm2
9677; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15]
9678; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5]
9679; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
9680; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
9681; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
9682; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
9683; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
9684; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
9685; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
9686; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
9687; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
9688; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
9689; AVX512DQ-FCP-NEXT:    movb $7, %al
9690; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
9691; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
9692; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
9693; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
9694; AVX512DQ-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
9695; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15]
9696; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
9697; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
9698; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
9699; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
9700; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
9701; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
9702; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
9703; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
9704; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
9705; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm6
9706; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
9707; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
9708; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
9709; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
9710; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15]
9711; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm4
9712; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
9713; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
9714; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
9715; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
9716; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
9717; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
9718; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
9719; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
9720; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
9721; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
9722; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
9723; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
9724; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9725; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rsi)
9726; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9727; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rsi)
9728; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9729; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rdx)
9730; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9731; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rdx)
9732; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
9733; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9734; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rcx)
9735; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%r8)
9736; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
9737; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r9)
9738; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
9739; AVX512DQ-FCP-NEXT:    addq $552, %rsp # imm = 0x228
9740; AVX512DQ-FCP-NEXT:    vzeroupper
9741; AVX512DQ-FCP-NEXT:    retq
9742;
9743; AVX512BW-LABEL: load_i16_stride5_vf64:
9744; AVX512BW:       # %bb.0:
9745; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm2
9746; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm4
9747; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm3
9748; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
9749; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
9750; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm7
9751; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm9
9752; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
9753; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
9754; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm10
9755; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
9756; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
9757; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
9758; AVX512BW-NEXT:    vpermt2w %zmm10, %zmm12, %zmm13
9759; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
9760; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8
9761; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm6, %zmm8
9762; AVX512BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
9763; AVX512BW-NEXT:    kmovd %eax, %k1
9764; AVX512BW-NEXT:    vmovdqu16 %zmm13, %zmm8 {%k1}
9765; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
9766; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm13, %zmm8
9767; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm12
9768; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm6
9769; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm6 {%k1}
9770; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm13, %zmm6
9771; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
9772; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
9773; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15
9774; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm14, %zmm15
9775; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
9776; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
9777; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm13
9778; AVX512BW-NEXT:    vmovdqu16 %zmm15, %zmm13 {%k1}
9779; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
9780; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm15, %zmm13
9781; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm14
9782; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm12
9783; AVX512BW-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
9784; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm15, %zmm12
9785; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
9786; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
9787; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm17
9788; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
9789; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
9790; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm15
9791; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
9792; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm15 {%k1}
9793; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
9794; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm17, %zmm15
9795; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
9796; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm14
9797; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
9798; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm17, %zmm14
9799; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
9800; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
9801; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17
9802; AVX512BW-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
9803; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
9804; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm19
9805; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
9806; AVX512BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
9807; AVX512BW-NEXT:    kmovd %eax, %k1
9808; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm19 {%k1}
9809; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
9810; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm17, %zmm19
9811; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm16
9812; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm18
9813; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm18 {%k1}
9814; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm17, %zmm18
9815; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
9816; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm16, %zmm9
9817; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
9818; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
9819; AVX512BW-NEXT:    vpermt2w %zmm10, %zmm11, %zmm1
9820; AVX512BW-NEXT:    movb $7, %al
9821; AVX512BW-NEXT:    kmovd %eax, %k1
9822; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
9823; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
9824; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm9, %zmm1
9825; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm11, %zmm0
9826; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm16, %zmm3
9827; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
9828; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm9, %zmm0
9829; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%rsi)
9830; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
9831; AVX512BW-NEXT:    vmovdqa64 %zmm12, 64(%rdx)
9832; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%rdx)
9833; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
9834; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
9835; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%r8)
9836; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%r8)
9837; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%r9)
9838; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%r9)
9839; AVX512BW-NEXT:    vzeroupper
9840; AVX512BW-NEXT:    retq
9841;
9842; AVX512BW-FCP-LABEL: load_i16_stride5_vf64:
9843; AVX512BW-FCP:       # %bb.0:
9844; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm2
9845; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm4
9846; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm3
9847; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm5
9848; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
9849; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm7
9850; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm9
9851; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
9852; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
9853; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm10
9854; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
9855; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
9856; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
9857; AVX512BW-FCP-NEXT:    vpermt2w %zmm10, %zmm12, %zmm13
9858; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
9859; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm8
9860; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm6, %zmm8
9861; AVX512BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
9862; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
9863; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm8 {%k1}
9864; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
9865; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm13, %zmm8
9866; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm12
9867; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm6
9868; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm12, %zmm6 {%k1}
9869; AVX512BW-FCP-NEXT:    vpermt2w %zmm2, %zmm13, %zmm6
9870; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
9871; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
9872; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm15
9873; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm14, %zmm15
9874; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
9875; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm13
9876; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm12, %zmm13
9877; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm13 {%k1}
9878; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
9879; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm15, %zmm13
9880; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm14
9881; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm12
9882; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
9883; AVX512BW-FCP-NEXT:    vpermt2w %zmm2, %zmm15, %zmm12
9884; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
9885; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
9886; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
9887; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
9888; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
9889; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
9890; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
9891; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm15 {%k1}
9892; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
9893; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm17, %zmm15
9894; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
9895; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm14
9896; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
9897; AVX512BW-FCP-NEXT:    vpermt2w %zmm2, %zmm17, %zmm14
9898; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
9899; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
9900; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
9901; AVX512BW-FCP-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
9902; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
9903; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
9904; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
9905; AVX512BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
9906; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
9907; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm19 {%k1}
9908; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
9909; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm17, %zmm19
9910; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm16
9911; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm18
9912; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm18 {%k1}
9913; AVX512BW-FCP-NEXT:    vpermt2w %zmm2, %zmm17, %zmm18
9914; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
9915; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm16, %zmm9
9916; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
9917; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
9918; AVX512BW-FCP-NEXT:    vpermt2w %zmm10, %zmm11, %zmm1
9919; AVX512BW-FCP-NEXT:    movb $7, %al
9920; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
9921; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
9922; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
9923; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm9, %zmm1
9924; AVX512BW-FCP-NEXT:    vpermt2w %zmm5, %zmm11, %zmm0
9925; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm16, %zmm3
9926; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
9927; AVX512BW-FCP-NEXT:    vpermt2w %zmm2, %zmm9, %zmm0
9928; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rsi)
9929; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
9930; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 64(%rdx)
9931; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%rdx)
9932; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
9933; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
9934; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 64(%r8)
9935; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
9936; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r9)
9937; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
9938; AVX512BW-FCP-NEXT:    vzeroupper
9939; AVX512BW-FCP-NEXT:    retq
9940;
9941; AVX512DQ-BW-LABEL: load_i16_stride5_vf64:
9942; AVX512DQ-BW:       # %bb.0:
9943; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm2
9944; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm4
9945; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm3
9946; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
9947; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
9948; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm7
9949; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm9
9950; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
9951; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
9952; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm10
9953; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
9954; AVX512DQ-BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
9955; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm13
9956; AVX512DQ-BW-NEXT:    vpermt2w %zmm10, %zmm12, %zmm13
9957; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
9958; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm8
9959; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm6, %zmm8
9960; AVX512DQ-BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
9961; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
9962; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm13, %zmm8 {%k1}
9963; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
9964; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm13, %zmm8
9965; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm12
9966; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm6
9967; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm6 {%k1}
9968; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm13, %zmm6
9969; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
9970; AVX512DQ-BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
9971; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm15
9972; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm14, %zmm15
9973; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
9974; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm13
9975; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm13
9976; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm15, %zmm13 {%k1}
9977; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
9978; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm15, %zmm13
9979; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm14
9980; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm12
9981; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
9982; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm15, %zmm12
9983; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
9984; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
9985; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm17
9986; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
9987; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
9988; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm15
9989; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
9990; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm15 {%k1}
9991; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
9992; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm17, %zmm15
9993; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
9994; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm14
9995; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
9996; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm17, %zmm14
9997; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
9998; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
9999; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm17
10000; AVX512DQ-BW-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
10001; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
10002; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm19
10003; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
10004; AVX512DQ-BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
10005; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
10006; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm19 {%k1}
10007; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
10008; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm17, %zmm19
10009; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm16
10010; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm18
10011; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm18 {%k1}
10012; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm17, %zmm18
10013; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
10014; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm16, %zmm9
10015; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
10016; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10017; AVX512DQ-BW-NEXT:    vpermt2w %zmm10, %zmm11, %zmm1
10018; AVX512DQ-BW-NEXT:    movb $7, %al
10019; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
10020; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
10021; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
10022; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm9, %zmm1
10023; AVX512DQ-BW-NEXT:    vpermt2w %zmm5, %zmm11, %zmm0
10024; AVX512DQ-BW-NEXT:    vpermt2w %zmm4, %zmm16, %zmm3
10025; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
10026; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm9, %zmm0
10027; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 64(%rsi)
10028; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
10029; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, 64(%rdx)
10030; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, (%rdx)
10031; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
10032; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
10033; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 64(%r8)
10034; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, (%r8)
10035; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%r9)
10036; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%r9)
10037; AVX512DQ-BW-NEXT:    vzeroupper
10038; AVX512DQ-BW-NEXT:    retq
10039;
10040; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf64:
10041; AVX512DQ-BW-FCP:       # %bb.0:
10042; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm2
10043; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm4
10044; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm3
10045; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm5
10046; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm0
10047; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm7
10048; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm9
10049; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
10050; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
10051; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm10
10052; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
10053; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
10054; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
10055; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm10, %zmm12, %zmm13
10056; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
10057; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm8
10058; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm6, %zmm8
10059; AVX512DQ-BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
10060; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
10061; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm8 {%k1}
10062; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
10063; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm13, %zmm8
10064; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm12
10065; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm6
10066; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm12, %zmm6 {%k1}
10067; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm2, %zmm13, %zmm6
10068; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
10069; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
10070; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm15
10071; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm14, %zmm15
10072; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
10073; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm13
10074; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm12, %zmm13
10075; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm13 {%k1}
10076; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
10077; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm15, %zmm13
10078; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm14
10079; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm12
10080; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
10081; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm2, %zmm15, %zmm12
10082; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
10083; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
10084; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
10085; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
10086; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
10087; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
10088; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
10089; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm15 {%k1}
10090; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
10091; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm17, %zmm15
10092; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
10093; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm14
10094; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
10095; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm2, %zmm17, %zmm14
10096; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
10097; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
10098; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
10099; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
10100; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
10101; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
10102; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
10103; AVX512DQ-BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
10104; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
10105; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm19 {%k1}
10106; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
10107; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm17, %zmm19
10108; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm16
10109; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm18
10110; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm18 {%k1}
10111; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm2, %zmm17, %zmm18
10112; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
10113; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm16, %zmm9
10114; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
10115; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10116; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm10, %zmm11, %zmm1
10117; AVX512DQ-BW-FCP-NEXT:    movb $7, %al
10118; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
10119; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
10120; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
10121; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm9, %zmm1
10122; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm5, %zmm11, %zmm0
10123; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm16, %zmm3
10124; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
10125; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm2, %zmm9, %zmm0
10126; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rsi)
10127; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
10128; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 64(%rdx)
10129; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%rdx)
10130; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
10131; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
10132; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 64(%r8)
10133; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
10134; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r9)
10135; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
10136; AVX512DQ-BW-FCP-NEXT:    vzeroupper
10137; AVX512DQ-BW-FCP-NEXT:    retq
10138  %wide.vec = load <320 x i16>, ptr %in.vec, align 64
10139  %strided.vec0 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315>
10140  %strided.vec1 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316>
10141  %strided.vec2 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317>
10142  %strided.vec3 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318>
10143  %strided.vec4 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319>
10144  store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
10145  store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
10146  store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
10147  store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
10148  store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
10149  ret void
10150}
10151