xref: /llvm-project/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll (revision 7457f51f6cf61b960e3e6e45e63378debd5c1d5c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
15
16; These patterns are produced by LoopVectorizer for interleaved loads.
17
18define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
19; SSE-LABEL: load_i16_stride6_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
22; SSE-NEXT:    movdqa (%rdi), %xmm0
23; SSE-NEXT:    movdqa 16(%rdi), %xmm1
24; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
25; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
26; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
27; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
28; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
29; SSE-NEXT:    movdqa %xmm0, %xmm5
30; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
31; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
32; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
33; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1]
34; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
35; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
36; SSE-NEXT:    psrlq $48, %xmm1
37; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
38; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
39; SSE-NEXT:    movd %xmm3, (%rsi)
40; SSE-NEXT:    movd %xmm2, (%rdx)
41; SSE-NEXT:    movd %xmm4, (%rcx)
42; SSE-NEXT:    movd %xmm5, (%r8)
43; SSE-NEXT:    movd %xmm7, (%r9)
44; SSE-NEXT:    movd %xmm0, (%rax)
45; SSE-NEXT:    retq
46;
47; AVX-LABEL: load_i16_stride6_vf2:
48; AVX:       # %bb.0:
49; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
50; AVX-NEXT:    vmovdqa (%rdi), %xmm0
51; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
52; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
53; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
54; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
55; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
56; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
57; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
58; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
59; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
60; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1]
61; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
62; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
63; AVX-NEXT:    vpsrlq $48, %xmm1, %xmm1
64; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
65; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
66; AVX-NEXT:    vmovd %xmm3, (%rsi)
67; AVX-NEXT:    vmovd %xmm2, (%rdx)
68; AVX-NEXT:    vmovd %xmm4, (%rcx)
69; AVX-NEXT:    vmovd %xmm5, (%r8)
70; AVX-NEXT:    vmovd %xmm6, (%r9)
71; AVX-NEXT:    vmovd %xmm0, (%rax)
72; AVX-NEXT:    retq
73;
74; AVX2-LABEL: load_i16_stride6_vf2:
75; AVX2:       # %bb.0:
76; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
77; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
78; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
79; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
80; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
81; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
82; AVX2-NEXT:    vpbroadcastw 4(%rdi), %xmm4
83; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
84; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
85; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
86; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
87; AVX2-NEXT:    vpbroadcastw 20(%rdi), %xmm6
88; AVX2-NEXT:    vpbroadcastw 8(%rdi), %xmm7
89; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
90; AVX2-NEXT:    vpsrlq $48, %xmm1, %xmm1
91; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
92; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
93; AVX2-NEXT:    vmovd %xmm3, (%rsi)
94; AVX2-NEXT:    vmovd %xmm2, (%rdx)
95; AVX2-NEXT:    vmovd %xmm4, (%rcx)
96; AVX2-NEXT:    vmovd %xmm5, (%r8)
97; AVX2-NEXT:    vmovd %xmm6, (%r9)
98; AVX2-NEXT:    vmovd %xmm0, (%rax)
99; AVX2-NEXT:    retq
100;
101; AVX2-FP-LABEL: load_i16_stride6_vf2:
102; AVX2-FP:       # %bb.0:
103; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
104; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
105; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
106; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
107; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
108; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
109; AVX2-FP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
110; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
111; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
112; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
113; AVX2-FP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
114; AVX2-FP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
115; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
116; AVX2-FP-NEXT:    vpsrlq $48, %xmm1, %xmm1
117; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
118; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
119; AVX2-FP-NEXT:    vmovd %xmm3, (%rsi)
120; AVX2-FP-NEXT:    vmovd %xmm2, (%rdx)
121; AVX2-FP-NEXT:    vmovd %xmm4, (%rcx)
122; AVX2-FP-NEXT:    vmovd %xmm5, (%r8)
123; AVX2-FP-NEXT:    vmovd %xmm6, (%r9)
124; AVX2-FP-NEXT:    vmovd %xmm0, (%rax)
125; AVX2-FP-NEXT:    retq
126;
127; AVX2-FCP-LABEL: load_i16_stride6_vf2:
128; AVX2-FCP:       # %bb.0:
129; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
130; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
131; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
132; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
133; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
134; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
135; AVX2-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
136; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
137; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
138; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
139; AVX2-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
140; AVX2-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
141; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
142; AVX2-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm1
143; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
144; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
145; AVX2-FCP-NEXT:    vmovd %xmm3, (%rsi)
146; AVX2-FCP-NEXT:    vmovd %xmm2, (%rdx)
147; AVX2-FCP-NEXT:    vmovd %xmm4, (%rcx)
148; AVX2-FCP-NEXT:    vmovd %xmm5, (%r8)
149; AVX2-FCP-NEXT:    vmovd %xmm6, (%r9)
150; AVX2-FCP-NEXT:    vmovd %xmm0, (%rax)
151; AVX2-FCP-NEXT:    retq
152;
153; AVX512-LABEL: load_i16_stride6_vf2:
154; AVX512:       # %bb.0:
155; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
156; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
157; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
158; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
159; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
160; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
161; AVX512-NEXT:    vpbroadcastw 4(%rdi), %xmm4
162; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
163; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
164; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
165; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
166; AVX512-NEXT:    vpbroadcastw 20(%rdi), %xmm6
167; AVX512-NEXT:    vpbroadcastw 8(%rdi), %xmm7
168; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
169; AVX512-NEXT:    vpsrlq $48, %xmm1, %xmm1
170; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
171; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
172; AVX512-NEXT:    vmovd %xmm3, (%rsi)
173; AVX512-NEXT:    vmovd %xmm2, (%rdx)
174; AVX512-NEXT:    vmovd %xmm4, (%rcx)
175; AVX512-NEXT:    vmovd %xmm5, (%r8)
176; AVX512-NEXT:    vmovd %xmm6, (%r9)
177; AVX512-NEXT:    vmovd %xmm0, (%rax)
178; AVX512-NEXT:    retq
179;
180; AVX512-FCP-LABEL: load_i16_stride6_vf2:
181; AVX512-FCP:       # %bb.0:
182; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
183; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
184; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
185; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
186; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
187; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
188; AVX512-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
189; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
190; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
191; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
192; AVX512-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
193; AVX512-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
194; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
195; AVX512-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm1
196; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
197; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
198; AVX512-FCP-NEXT:    vmovd %xmm3, (%rsi)
199; AVX512-FCP-NEXT:    vmovd %xmm2, (%rdx)
200; AVX512-FCP-NEXT:    vmovd %xmm4, (%rcx)
201; AVX512-FCP-NEXT:    vmovd %xmm5, (%r8)
202; AVX512-FCP-NEXT:    vmovd %xmm6, (%r9)
203; AVX512-FCP-NEXT:    vmovd %xmm0, (%rax)
204; AVX512-FCP-NEXT:    retq
205;
206; AVX512DQ-LABEL: load_i16_stride6_vf2:
207; AVX512DQ:       # %bb.0:
208; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
209; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
210; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
211; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
212; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
213; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
214; AVX512DQ-NEXT:    vpbroadcastw 4(%rdi), %xmm4
215; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
216; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
217; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
218; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
219; AVX512DQ-NEXT:    vpbroadcastw 20(%rdi), %xmm6
220; AVX512DQ-NEXT:    vpbroadcastw 8(%rdi), %xmm7
221; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
222; AVX512DQ-NEXT:    vpsrlq $48, %xmm1, %xmm1
223; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
224; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
225; AVX512DQ-NEXT:    vmovd %xmm3, (%rsi)
226; AVX512DQ-NEXT:    vmovd %xmm2, (%rdx)
227; AVX512DQ-NEXT:    vmovd %xmm4, (%rcx)
228; AVX512DQ-NEXT:    vmovd %xmm5, (%r8)
229; AVX512DQ-NEXT:    vmovd %xmm6, (%r9)
230; AVX512DQ-NEXT:    vmovd %xmm0, (%rax)
231; AVX512DQ-NEXT:    retq
232;
233; AVX512DQ-FCP-LABEL: load_i16_stride6_vf2:
234; AVX512DQ-FCP:       # %bb.0:
235; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
236; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
237; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
238; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
239; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
240; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
241; AVX512DQ-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
242; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
243; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
244; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
245; AVX512DQ-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
246; AVX512DQ-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
247; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
248; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm1
249; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
250; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
251; AVX512DQ-FCP-NEXT:    vmovd %xmm3, (%rsi)
252; AVX512DQ-FCP-NEXT:    vmovd %xmm2, (%rdx)
253; AVX512DQ-FCP-NEXT:    vmovd %xmm4, (%rcx)
254; AVX512DQ-FCP-NEXT:    vmovd %xmm5, (%r8)
255; AVX512DQ-FCP-NEXT:    vmovd %xmm6, (%r9)
256; AVX512DQ-FCP-NEXT:    vmovd %xmm0, (%rax)
257; AVX512DQ-FCP-NEXT:    retq
258;
259; AVX512BW-LABEL: load_i16_stride6_vf2:
260; AVX512BW:       # %bb.0:
261; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
262; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
263; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
264; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
265; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
266; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
267; AVX512BW-NEXT:    vpbroadcastw 4(%rdi), %xmm4
268; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
269; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
270; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
271; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
272; AVX512BW-NEXT:    vpbroadcastw 20(%rdi), %xmm6
273; AVX512BW-NEXT:    vpbroadcastw 8(%rdi), %xmm7
274; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
275; AVX512BW-NEXT:    vpsrlq $48, %xmm1, %xmm1
276; AVX512BW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
277; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
278; AVX512BW-NEXT:    vmovd %xmm3, (%rsi)
279; AVX512BW-NEXT:    vmovd %xmm2, (%rdx)
280; AVX512BW-NEXT:    vmovd %xmm4, (%rcx)
281; AVX512BW-NEXT:    vmovd %xmm5, (%r8)
282; AVX512BW-NEXT:    vmovd %xmm6, (%r9)
283; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
284; AVX512BW-NEXT:    retq
285;
286; AVX512BW-FCP-LABEL: load_i16_stride6_vf2:
287; AVX512BW-FCP:       # %bb.0:
288; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
289; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
290; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
291; AVX512BW-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
292; AVX512BW-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
293; AVX512BW-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
294; AVX512BW-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
295; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
296; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11]
297; AVX512BW-FCP-NEXT:    vpermw (%rdi), %ymm5, %ymm5
298; AVX512BW-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
299; AVX512BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
300; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
301; AVX512BW-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm1
302; AVX512BW-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
303; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
304; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%rsi)
305; AVX512BW-FCP-NEXT:    vmovd %xmm2, (%rdx)
306; AVX512BW-FCP-NEXT:    vmovd %xmm4, (%rcx)
307; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%r8)
308; AVX512BW-FCP-NEXT:    vmovd %xmm6, (%r9)
309; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%rax)
310; AVX512BW-FCP-NEXT:    vzeroupper
311; AVX512BW-FCP-NEXT:    retq
312;
313; AVX512DQ-BW-LABEL: load_i16_stride6_vf2:
314; AVX512DQ-BW:       # %bb.0:
315; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
316; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
317; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
318; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
319; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
320; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
321; AVX512DQ-BW-NEXT:    vpbroadcastw 4(%rdi), %xmm4
322; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
323; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
324; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
325; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
326; AVX512DQ-BW-NEXT:    vpbroadcastw 20(%rdi), %xmm6
327; AVX512DQ-BW-NEXT:    vpbroadcastw 8(%rdi), %xmm7
328; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
329; AVX512DQ-BW-NEXT:    vpsrlq $48, %xmm1, %xmm1
330; AVX512DQ-BW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
331; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
332; AVX512DQ-BW-NEXT:    vmovd %xmm3, (%rsi)
333; AVX512DQ-BW-NEXT:    vmovd %xmm2, (%rdx)
334; AVX512DQ-BW-NEXT:    vmovd %xmm4, (%rcx)
335; AVX512DQ-BW-NEXT:    vmovd %xmm5, (%r8)
336; AVX512DQ-BW-NEXT:    vmovd %xmm6, (%r9)
337; AVX512DQ-BW-NEXT:    vmovd %xmm0, (%rax)
338; AVX512DQ-BW-NEXT:    retq
339;
340; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf2:
341; AVX512DQ-BW-FCP:       # %bb.0:
342; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
343; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
344; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
345; AVX512DQ-BW-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
346; AVX512DQ-BW-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
347; AVX512DQ-BW-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
348; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
349; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
350; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11]
351; AVX512DQ-BW-FCP-NEXT:    vpermw (%rdi), %ymm5, %ymm5
352; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
353; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
354; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
355; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm1
356; AVX512DQ-BW-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
357; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
358; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%rsi)
359; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm2, (%rdx)
360; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm4, (%rcx)
361; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%r8)
362; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm6, (%r9)
363; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%rax)
364; AVX512DQ-BW-FCP-NEXT:    vzeroupper
365; AVX512DQ-BW-FCP-NEXT:    retq
366  %wide.vec = load <12 x i16>, ptr %in.vec, align 64
367  %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
368  %strided.vec1 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 1, i32 7>
369  %strided.vec2 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 2, i32 8>
370  %strided.vec3 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 3, i32 9>
371  %strided.vec4 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 4, i32 10>
372  %strided.vec5 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
373  store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
374  store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
375  store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
376  store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
377  store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
378  store <2 x i16> %strided.vec5, ptr %out.vec5, align 64
379  ret void
380}
381
382define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
383; SSE-LABEL: load_i16_stride6_vf4:
384; SSE:       # %bb.0:
385; SSE-NEXT:    movdqa (%rdi), %xmm0
386; SSE-NEXT:    movdqa 16(%rdi), %xmm1
387; SSE-NEXT:    movdqa 32(%rdi), %xmm5
388; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
389; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
390; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
391; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
392; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
393; SSE-NEXT:    movdqa %xmm2, %xmm6
394; SSE-NEXT:    pandn %xmm5, %xmm6
395; SSE-NEXT:    movdqa %xmm1, %xmm7
396; SSE-NEXT:    psrld $16, %xmm7
397; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
398; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
399; SSE-NEXT:    pand %xmm2, %xmm3
400; SSE-NEXT:    por %xmm6, %xmm3
401; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3]
402; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3]
403; SSE-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
404; SSE-NEXT:    movdqa %xmm2, %xmm8
405; SSE-NEXT:    pandn %xmm5, %xmm8
406; SSE-NEXT:    movdqa %xmm0, %xmm5
407; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0]
408; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3]
409; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7]
410; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
411; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7]
412; SSE-NEXT:    pand %xmm2, %xmm9
413; SSE-NEXT:    por %xmm8, %xmm9
414; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
415; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
416; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
417; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
418; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
419; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
420; SSE-NEXT:    pand %xmm2, %xmm5
421; SSE-NEXT:    pandn %xmm6, %xmm2
422; SSE-NEXT:    por %xmm5, %xmm2
423; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
424; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
425; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
426; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7]
427; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
428; SSE-NEXT:    psrlq $48, %xmm1
429; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
430; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
431; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7]
432; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
433; SSE-NEXT:    movq %xmm4, (%rsi)
434; SSE-NEXT:    movq %xmm3, (%rdx)
435; SSE-NEXT:    movq %xmm9, (%rcx)
436; SSE-NEXT:    movq %xmm2, (%r8)
437; SSE-NEXT:    movq %xmm6, (%r9)
438; SSE-NEXT:    movq %xmm0, (%rax)
439; SSE-NEXT:    retq
440;
441; AVX-LABEL: load_i16_stride6_vf4:
442; AVX:       # %bb.0:
443; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
444; AVX-NEXT:    vmovdqa (%rdi), %xmm0
445; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
446; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
447; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
448; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
449; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
450; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
451; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
452; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
453; AVX-NEXT:    vpsrld $16, %xmm1, %xmm5
454; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
455; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
456; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
457; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
458; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
459; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
460; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7]
461; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
462; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
463; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
464; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
465; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
466; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
467; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
468; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
469; AVX-NEXT:    vpsrlq $48, %xmm1, %xmm1
470; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
471; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
472; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7]
473; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
474; AVX-NEXT:    vmovq %xmm4, (%rsi)
475; AVX-NEXT:    vmovq %xmm3, (%rdx)
476; AVX-NEXT:    vmovq %xmm5, (%rcx)
477; AVX-NEXT:    vmovq %xmm6, (%r8)
478; AVX-NEXT:    vmovq %xmm7, (%r9)
479; AVX-NEXT:    vmovq %xmm0, (%rax)
480; AVX-NEXT:    retq
481;
482; AVX2-LABEL: load_i16_stride6_vf4:
483; AVX2:       # %bb.0:
484; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
485; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
486; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
487; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
488; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
489; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
490; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
491; AVX2-NEXT:    vpsrld $16, %xmm1, %xmm4
492; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
493; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
494; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
495; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
496; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
497; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
498; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
499; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
500; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
501; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
502; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
503; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
504; AVX2-NEXT:    vmovq %xmm3, (%rsi)
505; AVX2-NEXT:    vmovq %xmm4, (%rdx)
506; AVX2-NEXT:    vmovq %xmm6, (%rcx)
507; AVX2-NEXT:    vmovq %xmm5, (%r8)
508; AVX2-NEXT:    vmovq %xmm1, (%r9)
509; AVX2-NEXT:    vmovq %xmm0, (%rax)
510; AVX2-NEXT:    retq
511;
512; AVX2-FP-LABEL: load_i16_stride6_vf4:
513; AVX2-FP:       # %bb.0:
514; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
515; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
516; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
517; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
518; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
519; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
520; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
521; AVX2-FP-NEXT:    vpsrld $16, %xmm1, %xmm4
522; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
523; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
524; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
525; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
526; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
527; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
528; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
529; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
530; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
531; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
532; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
533; AVX2-FP-NEXT:    vmovq %xmm3, (%rsi)
534; AVX2-FP-NEXT:    vmovq %xmm4, (%rdx)
535; AVX2-FP-NEXT:    vmovq %xmm6, (%rcx)
536; AVX2-FP-NEXT:    vmovq %xmm5, (%r8)
537; AVX2-FP-NEXT:    vmovq %xmm1, (%r9)
538; AVX2-FP-NEXT:    vmovq %xmm0, (%rax)
539; AVX2-FP-NEXT:    retq
540;
541; AVX2-FCP-LABEL: load_i16_stride6_vf4:
542; AVX2-FCP:       # %bb.0:
543; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
544; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
545; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
546; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
547; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
548; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
549; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
550; AVX2-FCP-NEXT:    vpsrld $16, %xmm1, %xmm4
551; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
552; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
553; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
554; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
555; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
556; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
557; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
558; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
559; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
560; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
561; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
562; AVX2-FCP-NEXT:    vmovq %xmm3, (%rsi)
563; AVX2-FCP-NEXT:    vmovq %xmm4, (%rdx)
564; AVX2-FCP-NEXT:    vmovq %xmm6, (%rcx)
565; AVX2-FCP-NEXT:    vmovq %xmm5, (%r8)
566; AVX2-FCP-NEXT:    vmovq %xmm1, (%r9)
567; AVX2-FCP-NEXT:    vmovq %xmm0, (%rax)
568; AVX2-FCP-NEXT:    retq
569;
570; AVX512-LABEL: load_i16_stride6_vf4:
571; AVX512:       # %bb.0:
572; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
573; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
574; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
575; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
576; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
577; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
578; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
579; AVX512-NEXT:    vpsrld $16, %xmm1, %xmm1
580; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
581; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
582; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
583; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
584; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
585; AVX512-NEXT:    vpermd (%rdi), %zmm1, %zmm1
586; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
587; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
588; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
589; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
590; AVX512-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
591; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
592; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
593; AVX512-NEXT:    vmovq %xmm3, (%rsi)
594; AVX512-NEXT:    vmovq %xmm0, (%rdx)
595; AVX512-NEXT:    vmovq %xmm2, (%rcx)
596; AVX512-NEXT:    vmovq %xmm1, (%r8)
597; AVX512-NEXT:    vmovq %xmm4, (%r9)
598; AVX512-NEXT:    vmovq %xmm5, (%rax)
599; AVX512-NEXT:    vzeroupper
600; AVX512-NEXT:    retq
601;
602; AVX512-FCP-LABEL: load_i16_stride6_vf4:
603; AVX512-FCP:       # %bb.0:
604; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
605; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
606; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
607; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
608; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
609; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
610; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
611; AVX512-FCP-NEXT:    vpsrld $16, %xmm1, %xmm1
612; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
613; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
614; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
615; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
616; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm1, %zmm1
617; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
618; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
619; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
620; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
621; AVX512-FCP-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
622; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
623; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
624; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
625; AVX512-FCP-NEXT:    vmovq %xmm0, (%rdx)
626; AVX512-FCP-NEXT:    vmovq %xmm2, (%rcx)
627; AVX512-FCP-NEXT:    vmovq %xmm1, (%r8)
628; AVX512-FCP-NEXT:    vmovq %xmm4, (%r9)
629; AVX512-FCP-NEXT:    vmovq %xmm5, (%rax)
630; AVX512-FCP-NEXT:    vzeroupper
631; AVX512-FCP-NEXT:    retq
632;
633; AVX512DQ-LABEL: load_i16_stride6_vf4:
634; AVX512DQ:       # %bb.0:
635; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
636; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
637; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
638; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
639; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
640; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
641; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
642; AVX512DQ-NEXT:    vpsrld $16, %xmm1, %xmm1
643; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
644; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
645; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
646; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
647; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
648; AVX512DQ-NEXT:    vpermd (%rdi), %zmm1, %zmm1
649; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
650; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
651; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
652; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
653; AVX512DQ-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
654; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
655; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
656; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
657; AVX512DQ-NEXT:    vmovq %xmm0, (%rdx)
658; AVX512DQ-NEXT:    vmovq %xmm2, (%rcx)
659; AVX512DQ-NEXT:    vmovq %xmm1, (%r8)
660; AVX512DQ-NEXT:    vmovq %xmm4, (%r9)
661; AVX512DQ-NEXT:    vmovq %xmm5, (%rax)
662; AVX512DQ-NEXT:    vzeroupper
663; AVX512DQ-NEXT:    retq
664;
665; AVX512DQ-FCP-LABEL: load_i16_stride6_vf4:
666; AVX512DQ-FCP:       # %bb.0:
667; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
668; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
669; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
670; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
671; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
672; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
673; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
674; AVX512DQ-FCP-NEXT:    vpsrld $16, %xmm1, %xmm1
675; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
676; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
677; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
678; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
679; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm1, %zmm1
680; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
681; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
682; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
683; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
684; AVX512DQ-FCP-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
685; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
686; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
687; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
688; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rdx)
689; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rcx)
690; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%r8)
691; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%r9)
692; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rax)
693; AVX512DQ-FCP-NEXT:    vzeroupper
694; AVX512DQ-FCP-NEXT:    retq
695;
696; AVX512BW-LABEL: load_i16_stride6_vf4:
697; AVX512BW:       # %bb.0:
698; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
699; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
700; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
701; AVX512BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
702; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
703; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm2
704; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
705; AVX512BW-NEXT:    vpermw %zmm1, %zmm3, %zmm3
706; AVX512BW-NEXT:    vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
707; AVX512BW-NEXT:    vpermw %zmm1, %zmm4, %zmm4
708; AVX512BW-NEXT:    vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
709; AVX512BW-NEXT:    vpermw %zmm1, %zmm5, %zmm5
710; AVX512BW-NEXT:    vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
711; AVX512BW-NEXT:    vpermw %zmm1, %zmm6, %zmm1
712; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
713; AVX512BW-NEXT:    vmovq %xmm2, (%rdx)
714; AVX512BW-NEXT:    vmovq %xmm3, (%rcx)
715; AVX512BW-NEXT:    vmovq %xmm4, (%r8)
716; AVX512BW-NEXT:    vmovq %xmm5, (%r9)
717; AVX512BW-NEXT:    vmovq %xmm1, (%rax)
718; AVX512BW-NEXT:    vzeroupper
719; AVX512BW-NEXT:    retq
720;
721; AVX512BW-FCP-LABEL: load_i16_stride6_vf4:
722; AVX512BW-FCP:       # %bb.0:
723; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
724; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
725; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
726; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
727; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
728; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm2
729; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
730; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm3
731; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
732; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
733; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
734; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm5
735; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
736; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm6, %zmm1
737; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rsi)
738; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rdx)
739; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
740; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%r8)
741; AVX512BW-FCP-NEXT:    vmovq %xmm5, (%r9)
742; AVX512BW-FCP-NEXT:    vmovq %xmm1, (%rax)
743; AVX512BW-FCP-NEXT:    vzeroupper
744; AVX512BW-FCP-NEXT:    retq
745;
746; AVX512DQ-BW-LABEL: load_i16_stride6_vf4:
747; AVX512DQ-BW:       # %bb.0:
748; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
749; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
750; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm1
751; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
752; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
753; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm2
754; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
755; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm3, %zmm3
756; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
757; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm4, %zmm4
758; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
759; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm5, %zmm5
760; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
761; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm6, %zmm1
762; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rsi)
763; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rdx)
764; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rcx)
765; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%r8)
766; AVX512DQ-BW-NEXT:    vmovq %xmm5, (%r9)
767; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%rax)
768; AVX512DQ-BW-NEXT:    vzeroupper
769; AVX512DQ-BW-NEXT:    retq
770;
771; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf4:
772; AVX512DQ-BW-FCP:       # %bb.0:
773; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
774; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
775; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
776; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
777; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
778; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm2
779; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
780; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm3
781; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
782; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
783; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
784; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm5
785; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
786; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm6, %zmm1
787; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rsi)
788; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rdx)
789; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
790; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%r8)
791; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm5, (%r9)
792; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, (%rax)
793; AVX512DQ-BW-FCP-NEXT:    vzeroupper
794; AVX512DQ-BW-FCP-NEXT:    retq
795  %wide.vec = load <24 x i16>, ptr %in.vec, align 64
796  %strided.vec0 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
797  %strided.vec1 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>
798  %strided.vec2 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20>
799  %strided.vec3 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21>
800  %strided.vec4 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22>
801  %strided.vec5 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23>
802  store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
803  store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
804  store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
805  store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
806  store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
807  store <4 x i16> %strided.vec5, ptr %out.vec5, align 64
808  ret void
809}
810
811define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
812; SSE-LABEL: load_i16_stride6_vf8:
813; SSE:       # %bb.0:
814; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
815; SSE-NEXT:    movdqa 64(%rdi), %xmm1
816; SSE-NEXT:    movdqa 80(%rdi), %xmm8
817; SSE-NEXT:    movdqa (%rdi), %xmm3
818; SSE-NEXT:    movdqa 16(%rdi), %xmm5
819; SSE-NEXT:    movdqa 32(%rdi), %xmm6
820; SSE-NEXT:    movdqa 48(%rdi), %xmm4
821; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,2,4,5,6,7]
822; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535]
823; SSE-NEXT:    movdqa %xmm0, %xmm9
824; SSE-NEXT:    pandn %xmm2, %xmm9
825; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
826; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7]
827; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
828; SSE-NEXT:    pand %xmm0, %xmm2
829; SSE-NEXT:    por %xmm9, %xmm2
830; SSE-NEXT:    movdqa %xmm1, %xmm9
831; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3]
832; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
833; SSE-NEXT:    movdqa %xmm8, %xmm12
834; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[3,0]
835; SSE-NEXT:    movaps %xmm1, %xmm10
836; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
837; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3]
838; SSE-NEXT:    pslld $16, %xmm8
839; SSE-NEXT:    psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
840; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
841; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3]
842; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm8[0,1,0,2,4,5,6,7]
843; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3]
844; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0]
845; SSE-NEXT:    movdqa %xmm5, %xmm9
846; SSE-NEXT:    psrld $16, %xmm9
847; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
848; SSE-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
849; SSE-NEXT:    pand %xmm0, %xmm7
850; SSE-NEXT:    pandn %xmm6, %xmm0
851; SSE-NEXT:    por %xmm7, %xmm0
852; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7]
853; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[3,1],xmm7[1,3]
854; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0]
855; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1]
856; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3]
857; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,1,0,3]
858; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
859; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0]
860; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535]
861; SSE-NEXT:    movdqa %xmm11, %xmm8
862; SSE-NEXT:    pandn %xmm6, %xmm8
863; SSE-NEXT:    movdqa %xmm3, %xmm13
864; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[1,0],xmm5[0,0]
865; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,3]
866; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7]
867; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
868; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm6[1,0,2,3,4,5,6,7]
869; SSE-NEXT:    pand %xmm11, %xmm14
870; SSE-NEXT:    por %xmm8, %xmm14
871; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
872; SSE-NEXT:    pand %xmm6, %xmm14
873; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2]
874; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7]
875; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
876; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,5,4]
877; SSE-NEXT:    movdqa %xmm6, %xmm8
878; SSE-NEXT:    pandn %xmm12, %xmm8
879; SSE-NEXT:    por %xmm14, %xmm8
880; SSE-NEXT:    movdqa %xmm4, %xmm12
881; SSE-NEXT:    psrlq $48, %xmm12
882; SSE-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm12[0]
883; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm13[3,1,2,3,4,5,6,7]
884; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3]
885; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
886; SSE-NEXT:    pand %xmm11, %xmm12
887; SSE-NEXT:    pandn %xmm9, %xmm11
888; SSE-NEXT:    por %xmm12, %xmm11
889; SSE-NEXT:    pand %xmm6, %xmm11
890; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7]
891; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,1,0,2]
892; SSE-NEXT:    movdqa %xmm6, %xmm9
893; SSE-NEXT:    pandn %xmm10, %xmm9
894; SSE-NEXT:    por %xmm11, %xmm9
895; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1]
896; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
897; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
898; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5,4,6]
899; SSE-NEXT:    punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1]
900; SSE-NEXT:    movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3]
901; SSE-NEXT:    andps %xmm6, %xmm10
902; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7]
903; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
904; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6]
905; SSE-NEXT:    movdqa %xmm6, %xmm12
906; SSE-NEXT:    pandn %xmm11, %xmm12
907; SSE-NEXT:    por %xmm10, %xmm12
908; SSE-NEXT:    psrlq $48, %xmm5
909; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
910; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
911; SSE-NEXT:    psrld $16, %xmm4
912; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7]
913; SSE-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1]
914; SSE-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
915; SSE-NEXT:    andps %xmm6, %xmm5
916; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
917; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
918; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
919; SSE-NEXT:    pandn %xmm1, %xmm6
920; SSE-NEXT:    por %xmm5, %xmm6
921; SSE-NEXT:    movaps %xmm2, (%rsi)
922; SSE-NEXT:    movaps %xmm0, (%rdx)
923; SSE-NEXT:    movdqa %xmm8, (%rcx)
924; SSE-NEXT:    movdqa %xmm9, (%r8)
925; SSE-NEXT:    movdqa %xmm12, (%r9)
926; SSE-NEXT:    movdqa %xmm6, (%rax)
927; SSE-NEXT:    retq
928;
929; AVX-LABEL: load_i16_stride6_vf8:
930; AVX:       # %bb.0:
931; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
932; AVX-NEXT:    vmovdqa (%rdi), %xmm0
933; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
934; AVX-NEXT:    vmovdqa 32(%rdi), %xmm4
935; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
936; AVX-NEXT:    vpsrlq $16, %xmm4, %xmm3
937; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3]
938; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
939; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
940; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3]
941; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7]
942; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
943; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7]
944; AVX-NEXT:    vmovdqa 80(%rdi), %xmm5
945; AVX-NEXT:    vpslld $16, %xmm5, %xmm9
946; AVX-NEXT:    vmovdqa 64(%rdi), %xmm6
947; AVX-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
948; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
949; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
950; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[1,1,1,1]
951; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
952; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
953; AVX-NEXT:    vpsrld $16, %xmm2, %xmm9
954; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
955; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
956; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
957; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3]
958; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
959; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
960; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1]
961; AVX-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
962; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
963; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
964; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
965; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7]
966; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7]
967; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
968; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm11[5,6,7]
969; AVX-NEXT:    vpsrlq $48, %xmm1, %xmm11
970; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[2,2,3,3]
971; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0]
972; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u]
973; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4],xmm9[5,6,7]
974; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
975; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
976; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[1,1,1,1]
977; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
978; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
979; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
980; AVX-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,4,6]
981; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1]
982; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7]
983; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
984; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
985; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
986; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm2
987; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
988; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
989; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
990; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7]
991; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
992; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
993; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
994; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
995; AVX-NEXT:    vmovdqa %xmm3, (%rsi)
996; AVX-NEXT:    vmovdqa %xmm7, (%rdx)
997; AVX-NEXT:    vmovdqa %xmm8, (%rcx)
998; AVX-NEXT:    vmovdqa %xmm9, (%r8)
999; AVX-NEXT:    vmovdqa %xmm6, (%r9)
1000; AVX-NEXT:    vmovdqa %xmm0, (%rax)
1001; AVX-NEXT:    retq
1002;
1003; AVX2-LABEL: load_i16_stride6_vf8:
1004; AVX2:       # %bb.0:
1005; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1006; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
1007; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
1008; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
1009; AVX2-NEXT:    vpslld $16, %xmm0, %xmm2
1010; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm1
1011; AVX2-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1012; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1013; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
1014; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
1015; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm7
1016; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
1017; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
1018; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
1019; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
1020; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
1021; AVX2-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
1022; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
1023; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
1024; AVX2-NEXT:    vpbroadcastw 74(%rdi), %xmm6
1025; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1026; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1027; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1028; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1029; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1030; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
1031; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1032; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
1033; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
1034; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1035; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1036; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1037; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1038; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1039; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
1040; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
1041; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1042; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1043; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1044; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1045; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
1046; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
1047; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1048; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1049; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1050; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1051; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
1052; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
1053; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
1054; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
1055; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
1056; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1057; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
1058; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
1059; AVX2-NEXT:    vmovdqa %xmm5, (%rdx)
1060; AVX2-NEXT:    vmovdqa %xmm8, (%rcx)
1061; AVX2-NEXT:    vmovdqa %xmm6, (%r8)
1062; AVX2-NEXT:    vmovdqa %xmm1, (%r9)
1063; AVX2-NEXT:    vmovdqa %xmm0, (%rax)
1064; AVX2-NEXT:    vzeroupper
1065; AVX2-NEXT:    retq
1066;
1067; AVX2-FP-LABEL: load_i16_stride6_vf8:
1068; AVX2-FP:       # %bb.0:
1069; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1070; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
1071; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
1072; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
1073; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
1074; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
1075; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm4
1076; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
1077; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1078; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
1079; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm3
1080; AVX2-FP-NEXT:    vpslld $16, %xmm3, %xmm7
1081; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm4
1082; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1083; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
1084; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3]
1085; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
1086; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
1087; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
1088; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
1089; AVX2-FP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
1090; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
1091; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1092; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
1093; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1094; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1095; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm6
1096; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1097; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
1098; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1099; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
1100; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1101; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1102; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1103; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
1104; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1105; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1106; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1107; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
1108; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
1109; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
1110; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1111; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1112; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
1113; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1114; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
1115; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
1116; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
1117; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
1118; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
1119; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1120; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
1121; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsi)
1122; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rdx)
1123; AVX2-FP-NEXT:    vmovdqa %xmm8, (%rcx)
1124; AVX2-FP-NEXT:    vmovdqa %xmm6, (%r8)
1125; AVX2-FP-NEXT:    vmovdqa %xmm4, (%r9)
1126; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rax)
1127; AVX2-FP-NEXT:    vzeroupper
1128; AVX2-FP-NEXT:    retq
1129;
1130; AVX2-FCP-LABEL: load_i16_stride6_vf8:
1131; AVX2-FCP:       # %bb.0:
1132; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1133; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1134; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
1135; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
1136; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
1137; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
1138; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
1139; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
1140; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1141; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
1142; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
1143; AVX2-FCP-NEXT:    vpslld $16, %xmm3, %xmm7
1144; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
1145; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1146; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
1147; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3]
1148; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
1149; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
1150; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
1151; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
1152; AVX2-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
1153; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
1154; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1155; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
1156; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1157; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1158; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
1159; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1160; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
1161; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1162; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
1163; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1164; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1165; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1166; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
1167; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1168; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1169; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1170; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
1171; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
1172; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
1173; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1174; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1175; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
1176; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1177; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
1178; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
1179; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
1180; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
1181; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
1182; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1183; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
1184; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rsi)
1185; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
1186; AVX2-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
1187; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
1188; AVX2-FCP-NEXT:    vmovdqa %xmm4, (%r9)
1189; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rax)
1190; AVX2-FCP-NEXT:    vzeroupper
1191; AVX2-FCP-NEXT:    retq
1192;
1193; AVX512-LABEL: load_i16_stride6_vf8:
1194; AVX512:       # %bb.0:
1195; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1196; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm0
1197; AVX512-NEXT:    vpslld $16, %xmm0, %xmm2
1198; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm1
1199; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1200; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1201; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
1202; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
1203; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
1204; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
1205; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm7
1206; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
1207; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
1208; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
1209; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
1210; AVX512-NEXT:    vpbroadcastw 74(%rdi), %xmm6
1211; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1212; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
1213; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
1214; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
1215; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
1216; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
1217; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
1218; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
1219; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1220; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1221; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1222; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1223; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
1224; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1225; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
1226; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
1227; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1228; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1229; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1230; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1231; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1232; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
1233; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
1234; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1235; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1236; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1237; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1238; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
1239; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
1240; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1241; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1242; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1243; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1244; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
1245; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
1246; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
1247; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
1248; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
1249; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1250; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
1251; AVX512-NEXT:    vmovdqa %xmm2, (%rsi)
1252; AVX512-NEXT:    vmovdqa %xmm5, (%rdx)
1253; AVX512-NEXT:    vmovdqa %xmm8, (%rcx)
1254; AVX512-NEXT:    vmovdqa %xmm6, (%r8)
1255; AVX512-NEXT:    vmovdqa %xmm1, (%r9)
1256; AVX512-NEXT:    vmovdqa %xmm0, (%rax)
1257; AVX512-NEXT:    vzeroupper
1258; AVX512-NEXT:    retq
1259;
1260; AVX512-FCP-LABEL: load_i16_stride6_vf8:
1261; AVX512-FCP:       # %bb.0:
1262; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1263; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
1264; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
1265; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
1266; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1267; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
1268; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
1269; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
1270; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1271; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
1272; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
1273; AVX512-FCP-NEXT:    vpslld $16, %xmm3, %xmm7
1274; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
1275; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1276; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
1277; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
1278; AVX512-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm7
1279; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
1280; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
1281; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
1282; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
1283; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
1284; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
1285; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1286; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1287; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1288; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
1289; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1290; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
1291; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1292; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
1293; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1294; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1295; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1296; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
1297; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1298; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1299; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1300; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
1301; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
1302; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
1303; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1304; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1305; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
1306; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1307; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
1308; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
1309; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
1310; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
1311; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
1312; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1313; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
1314; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
1315; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
1316; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
1317; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r8)
1318; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%r9)
1319; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%rax)
1320; AVX512-FCP-NEXT:    vzeroupper
1321; AVX512-FCP-NEXT:    retq
1322;
1323; AVX512DQ-LABEL: load_i16_stride6_vf8:
1324; AVX512DQ:       # %bb.0:
1325; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1326; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm0
1327; AVX512DQ-NEXT:    vpslld $16, %xmm0, %xmm2
1328; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm1
1329; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1330; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1331; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
1332; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
1333; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
1334; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
1335; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm7
1336; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
1337; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
1338; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
1339; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
1340; AVX512DQ-NEXT:    vpbroadcastw 74(%rdi), %xmm6
1341; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1342; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
1343; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
1344; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
1345; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
1346; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
1347; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
1348; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
1349; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
1350; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1351; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1352; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1353; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm6
1354; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1355; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
1356; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
1357; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1358; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1359; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1360; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1361; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1362; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
1363; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
1364; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1365; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1366; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1367; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1368; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
1369; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
1370; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1371; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1372; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1373; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1374; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
1375; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
1376; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
1377; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
1378; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
1379; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1380; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
1381; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsi)
1382; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rdx)
1383; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rcx)
1384; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r8)
1385; AVX512DQ-NEXT:    vmovdqa %xmm1, (%r9)
1386; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rax)
1387; AVX512DQ-NEXT:    vzeroupper
1388; AVX512DQ-NEXT:    retq
1389;
1390; AVX512DQ-FCP-LABEL: load_i16_stride6_vf8:
1391; AVX512DQ-FCP:       # %bb.0:
1392; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1393; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
1394; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
1395; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
1396; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1397; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
1398; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
1399; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
1400; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1401; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
1402; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
1403; AVX512DQ-FCP-NEXT:    vpslld $16, %xmm3, %xmm7
1404; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
1405; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1406; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
1407; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
1408; AVX512DQ-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm7
1409; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
1410; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
1411; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
1412; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
1413; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
1414; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
1415; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1416; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
1417; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
1418; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
1419; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
1420; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
1421; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
1422; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
1423; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
1424; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
1425; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
1426; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
1427; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
1428; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
1429; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
1430; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
1431; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
1432; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
1433; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
1434; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
1435; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
1436; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
1437; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
1438; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
1439; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
1440; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
1441; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
1442; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
1443; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
1444; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
1445; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
1446; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
1447; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r8)
1448; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%r9)
1449; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rax)
1450; AVX512DQ-FCP-NEXT:    vzeroupper
1451; AVX512DQ-FCP-NEXT:    retq
1452;
1453; AVX512BW-LABEL: load_i16_stride6_vf8:
1454; AVX512BW:       # %bb.0:
1455; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1456; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1457; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1458; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
1459; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1460; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
1461; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1462; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
1463; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1464; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
1465; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1466; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
1467; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1468; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
1469; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm7
1470; AVX512BW-NEXT:    vmovdqa %xmm2, (%rsi)
1471; AVX512BW-NEXT:    vmovdqa %xmm3, (%rdx)
1472; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
1473; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
1474; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
1475; AVX512BW-NEXT:    vmovdqa %xmm7, (%rax)
1476; AVX512BW-NEXT:    vzeroupper
1477; AVX512BW-NEXT:    retq
1478;
1479; AVX512BW-FCP-LABEL: load_i16_stride6_vf8:
1480; AVX512BW-FCP:       # %bb.0:
1481; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1482; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1483; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1484; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
1485; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1486; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
1487; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1488; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
1489; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1490; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
1491; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1492; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
1493; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1494; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
1495; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm7
1496; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1497; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
1498; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
1499; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
1500; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
1501; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
1502; AVX512BW-FCP-NEXT:    vzeroupper
1503; AVX512BW-FCP-NEXT:    retq
1504;
1505; AVX512DQ-BW-LABEL: load_i16_stride6_vf8:
1506; AVX512DQ-BW:       # %bb.0:
1507; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1508; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1509; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
1510; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
1511; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1512; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
1513; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1514; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
1515; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1516; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
1517; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1518; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
1519; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1520; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
1521; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm7
1522; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rsi)
1523; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rdx)
1524; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
1525; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
1526; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
1527; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%rax)
1528; AVX512DQ-BW-NEXT:    vzeroupper
1529; AVX512DQ-BW-NEXT:    retq
1530;
1531; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf8:
1532; AVX512DQ-BW-FCP:       # %bb.0:
1533; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1534; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
1535; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
1536; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
1537; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1538; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
1539; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1540; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
1541; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm4
1542; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
1543; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
1544; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
1545; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
1546; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
1547; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm7
1548; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1549; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
1550; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
1551; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
1552; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
1553; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
1554; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1555; AVX512DQ-BW-FCP-NEXT:    retq
1556  %wide.vec = load <48 x i16>, ptr %in.vec, align 64
1557  %strided.vec0 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
1558  %strided.vec1 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
1559  %strided.vec2 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
1560  %strided.vec3 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
1561  %strided.vec4 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
1562  %strided.vec5 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
1563  store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
1564  store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
1565  store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
1566  store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
1567  store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
1568  store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
1569  ret void
1570}
1571
1572define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
1573; SSE-LABEL: load_i16_stride6_vf16:
1574; SSE:       # %bb.0:
1575; SSE-NEXT:    subq $136, %rsp
1576; SSE-NEXT:    movdqa 112(%rdi), %xmm9
1577; SSE-NEXT:    movdqa 128(%rdi), %xmm7
1578; SSE-NEXT:    movdqa 64(%rdi), %xmm2
1579; SSE-NEXT:    movdqa 80(%rdi), %xmm11
1580; SSE-NEXT:    movdqa (%rdi), %xmm3
1581; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1582; SSE-NEXT:    movdqa 16(%rdi), %xmm6
1583; SSE-NEXT:    movdqa 32(%rdi), %xmm0
1584; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1585; SSE-NEXT:    movdqa 48(%rdi), %xmm8
1586; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
1587; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
1588; SSE-NEXT:    movdqa %xmm10, %xmm1
1589; SSE-NEXT:    pandn %xmm0, %xmm1
1590; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
1591; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1592; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
1593; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
1594; SSE-NEXT:    pand %xmm10, %xmm0
1595; SSE-NEXT:    por %xmm1, %xmm0
1596; SSE-NEXT:    movdqa %xmm0, %xmm1
1597; SSE-NEXT:    movdqa %xmm2, %xmm13
1598; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[2,2,3,3]
1599; SSE-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1600; SSE-NEXT:    movdqa %xmm11, %xmm0
1601; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
1602; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1603; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1604; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0]
1605; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3]
1606; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1607; SSE-NEXT:    pslld $16, %xmm11
1608; SSE-NEXT:    psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1609; SSE-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
1610; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3]
1611; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7]
1612; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3]
1613; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0]
1614; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1615; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7]
1616; SSE-NEXT:    movdqa %xmm10, %xmm3
1617; SSE-NEXT:    pandn %xmm0, %xmm3
1618; SSE-NEXT:    movdqa 96(%rdi), %xmm0
1619; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1620; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
1621; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
1622; SSE-NEXT:    movdqa %xmm9, %xmm11
1623; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1624; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
1625; SSE-NEXT:    pand %xmm10, %xmm2
1626; SSE-NEXT:    por %xmm3, %xmm2
1627; SSE-NEXT:    movdqa 160(%rdi), %xmm14
1628; SSE-NEXT:    movdqa 176(%rdi), %xmm3
1629; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
1630; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1631; SSE-NEXT:    movdqa %xmm3, %xmm1
1632; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0]
1633; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1634; SSE-NEXT:    movdqa %xmm14, %xmm13
1635; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1636; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[0,0]
1637; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm3[2,3]
1638; SSE-NEXT:    pslld $16, %xmm3
1639; SSE-NEXT:    psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1640; SSE-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
1641; SSE-NEXT:    movdqa 144(%rdi), %xmm1
1642; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1643; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1644; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7]
1645; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[3,1],xmm9[1,3]
1646; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0]
1647; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1648; SSE-NEXT:    movdqa %xmm6, %xmm13
1649; SSE-NEXT:    psrld $16, %xmm13
1650; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1651; SSE-NEXT:    # xmm9 = mem[0,1,2,3,5,7,6,7]
1652; SSE-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3]
1653; SSE-NEXT:    movdqa %xmm10, %xmm13
1654; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1655; SSE-NEXT:    pandn %xmm15, %xmm13
1656; SSE-NEXT:    pand %xmm10, %xmm9
1657; SSE-NEXT:    por %xmm13, %xmm9
1658; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
1659; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3]
1660; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0]
1661; SSE-NEXT:    movdqa %xmm11, %xmm4
1662; SSE-NEXT:    psrld $16, %xmm4
1663; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7]
1664; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1665; SSE-NEXT:    pand %xmm10, %xmm2
1666; SSE-NEXT:    movdqa %xmm7, %xmm5
1667; SSE-NEXT:    pandn %xmm7, %xmm10
1668; SSE-NEXT:    por %xmm2, %xmm10
1669; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1670; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
1671; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
1672; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1673; SSE-NEXT:    movdqa %xmm15, %xmm1
1674; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1675; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
1676; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1677; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
1678; SSE-NEXT:    movdqa %xmm2, %xmm4
1679; SSE-NEXT:    pandn %xmm1, %xmm4
1680; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1681; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1682; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0]
1683; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3]
1684; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
1685; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1686; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7]
1687; SSE-NEXT:    pand %xmm2, %xmm1
1688; SSE-NEXT:    por %xmm4, %xmm1
1689; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1690; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1691; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
1692; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1693; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
1694; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
1695; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
1696; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0]
1697; SSE-NEXT:    movdqa %xmm12, %xmm0
1698; SSE-NEXT:    pandn %xmm4, %xmm0
1699; SSE-NEXT:    pand %xmm12, %xmm1
1700; SSE-NEXT:    por %xmm1, %xmm0
1701; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1702; SSE-NEXT:    movdqa %xmm7, %xmm1
1703; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1704; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1705; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1706; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1]
1707; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
1708; SSE-NEXT:    movdqa %xmm2, %xmm4
1709; SSE-NEXT:    pandn %xmm1, %xmm4
1710; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1711; SSE-NEXT:    movaps %xmm10, %xmm13
1712; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1713; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0]
1714; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3]
1715; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
1716; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1717; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,0,2,3,4,5,6,7]
1718; SSE-NEXT:    pand %xmm2, %xmm0
1719; SSE-NEXT:    por %xmm4, %xmm0
1720; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1721; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1722; SSE-NEXT:    # xmm6 = xmm6[0,1],mem[0,2]
1723; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7]
1724; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1725; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,5,4]
1726; SSE-NEXT:    movdqa %xmm12, %xmm1
1727; SSE-NEXT:    pandn %xmm4, %xmm1
1728; SSE-NEXT:    pand %xmm12, %xmm0
1729; SSE-NEXT:    por %xmm0, %xmm1
1730; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1731; SSE-NEXT:    movdqa %xmm8, %xmm1
1732; SSE-NEXT:    movdqa %xmm8, (%rsp) # 16-byte Spill
1733; SSE-NEXT:    movdqa %xmm8, %xmm0
1734; SSE-NEXT:    psrlq $48, %xmm0
1735; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1736; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3]
1737; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
1738; SSE-NEXT:    movdqa %xmm2, %xmm0
1739; SSE-NEXT:    pandn %xmm4, %xmm0
1740; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1741; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
1742; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
1743; SSE-NEXT:    pand %xmm2, %xmm3
1744; SSE-NEXT:    por %xmm0, %xmm3
1745; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1746; SSE-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,7]
1747; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1748; SSE-NEXT:    movdqa %xmm12, %xmm11
1749; SSE-NEXT:    pandn %xmm0, %xmm11
1750; SSE-NEXT:    pand %xmm12, %xmm3
1751; SSE-NEXT:    por %xmm3, %xmm11
1752; SSE-NEXT:    movdqa %xmm7, %xmm4
1753; SSE-NEXT:    movdqa %xmm7, %xmm0
1754; SSE-NEXT:    psrlq $48, %xmm0
1755; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3]
1756; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1757; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
1758; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1759; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
1760; SSE-NEXT:    pand %xmm2, %xmm0
1761; SSE-NEXT:    pandn %xmm3, %xmm2
1762; SSE-NEXT:    por %xmm0, %xmm2
1763; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,7]
1764; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1765; SSE-NEXT:    movdqa %xmm12, %xmm7
1766; SSE-NEXT:    pandn %xmm0, %xmm7
1767; SSE-NEXT:    pand %xmm12, %xmm2
1768; SSE-NEXT:    por %xmm2, %xmm7
1769; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1770; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
1771; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1772; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3]
1773; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1774; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm8[0,1,0,3]
1775; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,4,6]
1776; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
1777; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
1778; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1779; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
1780; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1781; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6]
1782; SSE-NEXT:    movdqa %xmm12, %xmm1
1783; SSE-NEXT:    pandn %xmm2, %xmm1
1784; SSE-NEXT:    andps %xmm12, %xmm3
1785; SSE-NEXT:    por %xmm3, %xmm1
1786; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
1787; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
1788; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1789; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1790; SSE-NEXT:    # xmm0 = mem[0,1,0,3]
1791; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1792; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6]
1793; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1]
1794; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
1795; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7]
1796; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
1797; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
1798; SSE-NEXT:    movdqa %xmm12, %xmm4
1799; SSE-NEXT:    pandn %xmm3, %xmm4
1800; SSE-NEXT:    andps %xmm12, %xmm2
1801; SSE-NEXT:    por %xmm2, %xmm4
1802; SSE-NEXT:    psrlq $48, %xmm5
1803; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1804; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1805; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
1806; SSE-NEXT:    psrld $16, %xmm0
1807; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,5,7]
1808; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
1809; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3]
1810; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7]
1811; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
1812; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
1813; SSE-NEXT:    movdqa %xmm12, %xmm10
1814; SSE-NEXT:    pandn %xmm3, %xmm10
1815; SSE-NEXT:    andps %xmm12, %xmm2
1816; SSE-NEXT:    por %xmm2, %xmm10
1817; SSE-NEXT:    psrlq $48, %xmm15
1818; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1819; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1820; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1821; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1822; SSE-NEXT:    psrld $16, %xmm3
1823; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1824; SSE-NEXT:    # xmm2 = mem[0,1,2,3,4,5,5,7]
1825; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
1826; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
1827; SSE-NEXT:    andps %xmm12, %xmm2
1828; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7]
1829; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
1830; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
1831; SSE-NEXT:    pandn %xmm3, %xmm12
1832; SSE-NEXT:    por %xmm2, %xmm12
1833; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1834; SSE-NEXT:    movaps %xmm0, 16(%rsi)
1835; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1836; SSE-NEXT:    movaps %xmm0, (%rsi)
1837; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1838; SSE-NEXT:    movaps %xmm0, 16(%rdx)
1839; SSE-NEXT:    movaps %xmm9, (%rdx)
1840; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1841; SSE-NEXT:    movaps %xmm0, 16(%rcx)
1842; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1843; SSE-NEXT:    movaps %xmm0, (%rcx)
1844; SSE-NEXT:    movdqa %xmm7, 16(%r8)
1845; SSE-NEXT:    movdqa %xmm11, (%r8)
1846; SSE-NEXT:    movdqa %xmm4, 16(%r9)
1847; SSE-NEXT:    movdqa %xmm1, (%r9)
1848; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1849; SSE-NEXT:    movdqa %xmm12, 16(%rax)
1850; SSE-NEXT:    movdqa %xmm10, (%rax)
1851; SSE-NEXT:    addq $136, %rsp
1852; SSE-NEXT:    retq
1853;
1854; AVX-LABEL: load_i16_stride6_vf16:
1855; AVX:       # %bb.0:
1856; AVX-NEXT:    subq $88, %rsp
1857; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
1858; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1859; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3]
1860; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7]
1861; AVX-NEXT:    vmovdqa 112(%rdi), %xmm8
1862; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
1863; AVX-NEXT:    vmovdqa 80(%rdi), %xmm1
1864; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1865; AVX-NEXT:    vpslld $16, %xmm1, %xmm2
1866; AVX-NEXT:    vmovdqa 64(%rdi), %xmm4
1867; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1868; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1869; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm5
1870; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1871; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1872; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
1873; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1874; AVX-NEXT:    vmovdqa 32(%rdi), %xmm7
1875; AVX-NEXT:    vmovdqa 48(%rdi), %xmm6
1876; AVX-NEXT:    vpsrlq $16, %xmm7, %xmm10
1877; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[0,3,2,3]
1878; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7]
1879; AVX-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1880; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3]
1881; AVX-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7]
1882; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1883; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7]
1884; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm5[3,4,5,6,7]
1885; AVX-NEXT:    vmovdqa 176(%rdi), %xmm10
1886; AVX-NEXT:    vpslld $16, %xmm10, %xmm5
1887; AVX-NEXT:    vmovdqa 160(%rdi), %xmm11
1888; AVX-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1889; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
1890; AVX-NEXT:    vmovdqa 128(%rdi), %xmm12
1891; AVX-NEXT:    vpsrlq $16, %xmm12, %xmm0
1892; AVX-NEXT:    vmovdqa 144(%rdi), %xmm1
1893; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1894; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1895; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7]
1896; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
1897; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7]
1898; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
1899; AVX-NEXT:    vandps %ymm2, %ymm14, %ymm2
1900; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1901; AVX-NEXT:    vandnps %ymm0, %ymm14, %ymm0
1902; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
1903; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1904; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7]
1905; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1906; AVX-NEXT:    vpsrld $16, %xmm8, %xmm2
1907; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1908; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1909; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3]
1910; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1911; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
1912; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1913; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
1914; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
1915; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1916; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7]
1917; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1918; AVX-NEXT:    vpsrld $16, %xmm5, %xmm9
1919; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
1920; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
1921; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1922; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1923; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
1924; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1925; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3]
1926; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
1927; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
1928; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm0
1929; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
1930; AVX-NEXT:    vandnps %ymm1, %ymm14, %ymm1
1931; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
1932; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1933; AVX-NEXT:    vmovdqa %xmm6, %xmm2
1934; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
1935; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1936; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1937; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
1938; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload
1939; AVX-NEXT:    # xmm6 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
1940; AVX-NEXT:    vpshufb %xmm1, %xmm6, %xmm3
1941; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7]
1942; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1943; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7]
1944; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
1945; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
1946; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7]
1947; AVX-NEXT:    vpshufb %xmm9, %xmm15, %xmm13
1948; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm13, %ymm1
1949; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
1950; AVX-NEXT:    vandnps %ymm0, %ymm13, %ymm0
1951; AVX-NEXT:    vandps %ymm1, %ymm13, %ymm1
1952; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
1953; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1954; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
1955; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1956; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0]
1957; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7]
1958; AVX-NEXT:    vpshufb %xmm9, %xmm8, %xmm9
1959; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7]
1960; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm0
1961; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
1962; AVX-NEXT:    vandnps %ymm1, %ymm14, %ymm1
1963; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
1964; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1965; AVX-NEXT:    vmovdqa %xmm2, %xmm0
1966; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm1
1967; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm7[2,2,3,3]
1968; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0]
1969; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
1970; AVX-NEXT:    vpshufb %xmm9, %xmm6, %xmm2
1971; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
1972; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm2
1973; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
1974; AVX-NEXT:    vpshufb %xmm3, %xmm15, %xmm9
1975; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm9, %ymm2
1976; AVX-NEXT:    vandnps %ymm1, %ymm13, %ymm1
1977; AVX-NEXT:    vandps %ymm2, %ymm13, %ymm2
1978; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
1979; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm2
1980; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3]
1981; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0]
1982; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
1983; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1984; AVX-NEXT:    vandps %ymm1, %ymm14, %ymm1
1985; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
1986; AVX-NEXT:    vandnps %ymm2, %ymm14, %ymm2
1987; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm14
1988; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1989; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
1990; AVX-NEXT:    vmovdqa %xmm5, %xmm13
1991; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
1992; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1993; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1994; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
1995; AVX-NEXT:    # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
1996; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
1997; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm8
1998; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm8, %ymm1
1999; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2000; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[1,1,1,1]
2001; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2002; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3]
2003; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
2004; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
2005; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5,4,6]
2006; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1]
2007; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7]
2008; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2009; AVX-NEXT:    vandnps %ymm1, %ymm9, %ymm1
2010; AVX-NEXT:    vandps %ymm9, %ymm8, %ymm8
2011; AVX-NEXT:    vorps %ymm1, %ymm8, %ymm1
2012; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
2013; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
2014; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3]
2015; AVX-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6]
2016; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2017; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm5[1]
2018; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7]
2019; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2020; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
2021; AVX-NEXT:    vpsrlq $48, %xmm6, %xmm3
2022; AVX-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2023; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
2024; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
2025; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
2026; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
2027; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm3
2028; AVX-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2029; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2030; AVX-NEXT:    vpsrld $16, %xmm0, %xmm4
2031; AVX-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
2032; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
2033; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
2034; AVX-NEXT:    vandnps %ymm2, %ymm9, %ymm2
2035; AVX-NEXT:    vandps %ymm3, %ymm9, %ymm3
2036; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
2037; AVX-NEXT:    vpshufb %xmm11, %xmm8, %xmm3
2038; AVX-NEXT:    vpsrld $16, %xmm5, %xmm4
2039; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7]
2040; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1]
2041; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
2042; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2043; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
2044; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2045; AVX-NEXT:    vmovaps %ymm3, (%rsi)
2046; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2047; AVX-NEXT:    vmovaps %ymm0, (%rdx)
2048; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2049; AVX-NEXT:    vmovaps %ymm0, (%rcx)
2050; AVX-NEXT:    vmovaps %ymm14, (%r8)
2051; AVX-NEXT:    vmovaps %ymm1, (%r9)
2052; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2053; AVX-NEXT:    vmovaps %ymm2, (%rax)
2054; AVX-NEXT:    addq $88, %rsp
2055; AVX-NEXT:    vzeroupper
2056; AVX-NEXT:    retq
2057;
2058; AVX2-LABEL: load_i16_stride6_vf16:
2059; AVX2:       # %bb.0:
2060; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
2061; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm5
2062; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
2063; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
2064; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
2065; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
2066; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2067; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm8[2,2,2,2,4,5,6,7]
2068; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
2069; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7]
2070; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
2071; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
2072; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
2073; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
2074; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm12
2075; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3]
2076; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
2077; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7]
2078; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
2079; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1]
2080; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2081; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2082; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
2083; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
2084; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
2085; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm12[2,1,0,3]
2086; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
2087; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2088; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2089; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2090; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm10[1,1,1,1,4,5,6,7]
2091; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7]
2092; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3]
2093; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
2094; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7]
2095; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
2096; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2097; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
2098; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
2099; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
2100; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2101; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
2102; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2103; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2104; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2105; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
2106; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
2107; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2108; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2109; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2110; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
2111; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
2112; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2113; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm12
2114; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2115; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
2116; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
2117; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2118; AVX2-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
2119; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2120; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
2121; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2122; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2123; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2124; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7]
2125; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
2126; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2127; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
2128; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
2129; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
2130; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2131; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
2132; AVX2-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
2133; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2134; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2]
2135; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2136; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2137; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
2138; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
2139; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7]
2140; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2141; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2142; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2143; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
2144; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
2145; AVX2-NEXT:    vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
2146; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
2147; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
2148; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3]
2149; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2150; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
2151; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
2152; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
2153; AVX2-NEXT:    vpblendvb %ymm11, %ymm2, %ymm5, %ymm2
2154; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
2155; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2156; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7]
2157; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6]
2158; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
2159; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
2160; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
2161; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
2162; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
2163; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
2164; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
2165; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2166; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
2167; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
2168; AVX2-NEXT:    vmovdqa %ymm3, (%rdx)
2169; AVX2-NEXT:    vmovdqa %ymm8, (%rcx)
2170; AVX2-NEXT:    vmovdqa %ymm9, (%r8)
2171; AVX2-NEXT:    vmovdqa %ymm5, (%r9)
2172; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2173; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
2174; AVX2-NEXT:    vzeroupper
2175; AVX2-NEXT:    retq
2176;
2177; AVX2-FP-LABEL: load_i16_stride6_vf16:
2178; AVX2-FP:       # %bb.0:
2179; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
2180; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
2181; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
2182; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm4
2183; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
2184; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
2185; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2186; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2187; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm7
2188; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
2189; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
2190; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
2191; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
2192; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
2193; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2194; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm11, %xmm7
2195; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
2196; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
2197; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
2198; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
2199; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
2200; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
2201; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2202; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2203; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
2204; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
2205; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
2206; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2207; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
2208; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2209; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2210; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2211; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
2212; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
2213; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
2214; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
2215; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
2216; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2217; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
2218; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2219; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2220; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2221; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm10
2222; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
2223; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
2224; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
2225; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2226; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
2227; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2228; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
2229; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
2230; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2231; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm11
2232; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
2233; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2234; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2235; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2236; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
2237; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
2238; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
2239; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
2240; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2241; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
2242; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
2243; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
2244; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
2245; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
2246; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
2247; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
2248; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
2249; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
2250; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
2251; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
2252; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
2253; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
2254; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm3
2255; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
2256; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2257; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
2258; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2259; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
2260; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
2261; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm5, %ymm7, %ymm5
2262; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
2263; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
2264; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
2265; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2266; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
2267; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2268; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
2269; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
2270; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm3
2271; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm6
2272; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2273; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
2274; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
2275; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
2276; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
2277; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2278; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
2279; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
2280; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
2281; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2282; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2283; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2284; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
2285; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rdx)
2286; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rcx)
2287; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
2288; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
2289; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2290; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
2291; AVX2-FP-NEXT:    vzeroupper
2292; AVX2-FP-NEXT:    retq
2293;
2294; AVX2-FCP-LABEL: load_i16_stride6_vf16:
2295; AVX2-FCP:       # %bb.0:
2296; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
2297; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
2298; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
2299; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
2300; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
2301; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
2302; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2303; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2304; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm7
2305; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
2306; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
2307; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
2308; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
2309; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
2310; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2311; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm7
2312; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
2313; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
2314; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
2315; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
2316; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
2317; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
2318; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2319; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2320; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
2321; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
2322; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
2323; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2324; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
2325; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2326; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2327; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2328; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
2329; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
2330; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
2331; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
2332; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
2333; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2334; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
2335; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2336; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2337; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2338; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
2339; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
2340; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
2341; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
2342; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2343; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
2344; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2345; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
2346; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
2347; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2348; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
2349; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
2350; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2351; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2352; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2353; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
2354; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
2355; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
2356; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
2357; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2358; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
2359; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
2360; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
2361; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
2362; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
2363; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
2364; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
2365; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
2366; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
2367; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
2368; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
2369; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
2370; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
2371; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
2372; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
2373; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2374; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
2375; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2376; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
2377; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
2378; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm5, %ymm7, %ymm5
2379; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
2380; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
2381; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
2382; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2383; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
2384; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2385; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
2386; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
2387; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
2388; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm6
2389; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2390; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
2391; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
2392; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
2393; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
2394; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2395; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
2396; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
2397; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
2398; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2399; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2400; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2401; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rsi)
2402; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
2403; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
2404; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%r8)
2405; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
2406; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2407; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rax)
2408; AVX2-FCP-NEXT:    vzeroupper
2409; AVX2-FCP-NEXT:    retq
2410;
2411; AVX512-LABEL: load_i16_stride6_vf16:
2412; AVX512:       # %bb.0:
2413; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm0
2414; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
2415; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
2416; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
2417; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
2418; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2419; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7]
2420; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm8
2421; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7]
2422; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
2423; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
2424; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3]
2425; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm7
2426; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2427; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2428; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
2429; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
2430; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm13
2431; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3]
2432; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
2433; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7]
2434; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7]
2435; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
2436; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
2437; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
2438; AVX512-NEXT:    vpshufb %xmm9, %xmm13, %xmm12
2439; AVX512-NEXT:    vpshufb %xmm9, %xmm11, %xmm9
2440; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7]
2441; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2442; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2443; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
2444; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2445; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
2446; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
2447; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15]
2448; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2449; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
2450; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
2451; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2452; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2453; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2454; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
2455; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
2456; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2457; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2458; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2459; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
2460; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
2461; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2462; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm12
2463; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2464; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
2465; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
2466; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2467; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2468; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2469; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2470; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2471; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2472; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7]
2473; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
2474; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2475; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
2476; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
2477; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
2478; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2479; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
2480; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
2481; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2482; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2483; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2484; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2485; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2486; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
2487; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
2488; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
2489; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2490; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2491; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2492; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
2493; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
2494; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
2495; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
2496; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2497; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7]
2498; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
2499; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
2500; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
2501; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7]
2502; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
2503; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
2504; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2505; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
2506; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2507; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4
2508; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
2509; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2510; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
2511; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7]
2512; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
2513; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2514; AVX512-NEXT:    vmovdqa %ymm1, (%rsi)
2515; AVX512-NEXT:    vmovdqa %ymm5, (%rdx)
2516; AVX512-NEXT:    vmovdqa %ymm8, (%rcx)
2517; AVX512-NEXT:    vmovdqa %ymm9, (%r8)
2518; AVX512-NEXT:    vmovdqa %ymm7, (%r9)
2519; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2520; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
2521; AVX512-NEXT:    vzeroupper
2522; AVX512-NEXT:    retq
2523;
2524; AVX512-FCP-LABEL: load_i16_stride6_vf16:
2525; AVX512-FCP:       # %bb.0:
2526; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2527; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2528; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
2529; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
2530; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
2531; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
2532; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm5
2533; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm6
2534; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
2535; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
2536; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
2537; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3]
2538; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm7
2539; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2540; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2541; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
2542; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
2543; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
2544; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2545; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
2546; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
2547; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm11
2548; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
2549; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
2550; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
2551; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
2552; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
2553; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
2554; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
2555; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7]
2556; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2557; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7]
2558; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5]
2559; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm13[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2560; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2561; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2562; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2563; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2564; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
2565; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
2566; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2567; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2568; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2569; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2570; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2571; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2572; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2573; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
2574; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
2575; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2576; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
2577; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2578; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2579; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2580; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2581; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2582; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2583; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2584; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2585; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
2586; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2587; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
2588; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
2589; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2590; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
2591; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
2592; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2593; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2594; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2595; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2596; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2597; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2598; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2599; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
2600; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2601; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2602; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2603; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
2604; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
2605; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2606; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
2607; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm12
2608; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2609; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
2610; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
2611; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
2612; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
2613; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
2614; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
2615; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
2616; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
2617; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2618; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11)
2619; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2620; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
2621; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2622; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
2623; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
2624; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2625; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
2626; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
2627; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
2628; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%r8)
2629; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%r9)
2630; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2631; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
2632; AVX512-FCP-NEXT:    vzeroupper
2633; AVX512-FCP-NEXT:    retq
2634;
2635; AVX512DQ-LABEL: load_i16_stride6_vf16:
2636; AVX512DQ:       # %bb.0:
2637; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm0
2638; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
2639; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
2640; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
2641; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
2642; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2643; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7]
2644; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm8
2645; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7]
2646; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
2647; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
2648; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3]
2649; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm7
2650; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2651; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2652; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
2653; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
2654; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm13
2655; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3]
2656; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
2657; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7]
2658; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7]
2659; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
2660; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
2661; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
2662; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm13, %xmm12
2663; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm11, %xmm9
2664; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7]
2665; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2666; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2667; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
2668; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2669; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
2670; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
2671; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15]
2672; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2673; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
2674; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
2675; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2676; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2677; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2678; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
2679; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
2680; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2681; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2682; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2683; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
2684; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
2685; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2686; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm12
2687; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2688; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
2689; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
2690; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2691; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2692; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2693; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2694; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2695; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2696; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7]
2697; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
2698; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2699; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
2700; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
2701; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
2702; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2703; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
2704; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
2705; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2706; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2707; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2708; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2709; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2710; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
2711; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
2712; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
2713; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2714; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2715; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2716; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
2717; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
2718; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
2719; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
2720; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2721; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7]
2722; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
2723; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
2724; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
2725; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7]
2726; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
2727; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
2728; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2729; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
2730; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2731; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4
2732; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
2733; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2734; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
2735; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7]
2736; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
2737; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2738; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rsi)
2739; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rdx)
2740; AVX512DQ-NEXT:    vmovdqa %ymm8, (%rcx)
2741; AVX512DQ-NEXT:    vmovdqa %ymm9, (%r8)
2742; AVX512DQ-NEXT:    vmovdqa %ymm7, (%r9)
2743; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2744; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
2745; AVX512DQ-NEXT:    vzeroupper
2746; AVX512DQ-NEXT:    retq
2747;
2748; AVX512DQ-FCP-LABEL: load_i16_stride6_vf16:
2749; AVX512DQ-FCP:       # %bb.0:
2750; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2751; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2752; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
2753; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
2754; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
2755; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
2756; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm5
2757; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm6
2758; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
2759; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
2760; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
2761; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3]
2762; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm7
2763; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2764; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
2765; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
2766; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
2767; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
2768; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2769; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
2770; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
2771; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm11
2772; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
2773; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
2774; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
2775; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
2776; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
2777; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
2778; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
2779; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7]
2780; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
2781; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7]
2782; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5]
2783; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm13[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
2784; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2785; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2786; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2787; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2788; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
2789; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
2790; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2791; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2792; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2793; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2794; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2795; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
2796; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2797; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
2798; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
2799; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2800; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
2801; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2802; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2803; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2804; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2805; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2806; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2807; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2808; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2809; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
2810; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2811; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
2812; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
2813; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2814; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
2815; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
2816; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2817; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2818; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2819; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2820; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2821; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2822; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2823; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
2824; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2825; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2826; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2827; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
2828; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
2829; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2830; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
2831; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm12
2832; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2833; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
2834; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
2835; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
2836; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
2837; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
2838; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
2839; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
2840; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
2841; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2842; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11)
2843; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2844; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
2845; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2846; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
2847; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
2848; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2849; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
2850; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
2851; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
2852; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%r8)
2853; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%r9)
2854; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2855; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
2856; AVX512DQ-FCP-NEXT:    vzeroupper
2857; AVX512DQ-FCP-NEXT:    retq
2858;
2859; AVX512BW-LABEL: load_i16_stride6_vf16:
2860; AVX512BW:       # %bb.0:
2861; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2862; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2863; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
2864; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
2865; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
2866; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
2867; AVX512BW-NEXT:    vpermw %zmm5, %zmm0, %zmm0
2868; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
2869; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
2870; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
2871; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2872; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2873; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
2874; AVX512BW-NEXT:    vpermw %zmm5, %zmm1, %zmm1
2875; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
2876; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
2877; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
2878; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2879; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2880; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
2881; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm6
2882; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm7
2883; AVX512BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
2884; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
2885; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
2886; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
2887; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
2888; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
2889; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
2890; AVX512BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
2891; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
2892; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
2893; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
2894; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
2895; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
2896; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
2897; AVX512BW-NEXT:    vpermw %zmm5, %zmm7, %zmm7
2898; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
2899; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
2900; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
2901; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
2902; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
2903; AVX512BW-NEXT:    vpermw %zmm5, %zmm8, %zmm5
2904; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
2905; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
2906; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
2907; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
2908; AVX512BW-NEXT:    vmovdqa %ymm1, (%rdx)
2909; AVX512BW-NEXT:    vmovdqa %ymm2, (%rcx)
2910; AVX512BW-NEXT:    vmovdqa %ymm6, (%r8)
2911; AVX512BW-NEXT:    vmovdqa %ymm7, (%r9)
2912; AVX512BW-NEXT:    vmovdqa %ymm3, (%rax)
2913; AVX512BW-NEXT:    vzeroupper
2914; AVX512BW-NEXT:    retq
2915;
2916; AVX512BW-FCP-LABEL: load_i16_stride6_vf16:
2917; AVX512BW-FCP:       # %bb.0:
2918; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2919; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2920; AVX512BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
2921; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
2922; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
2923; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
2924; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm0, %zmm0
2925; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
2926; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
2927; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
2928; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2929; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2930; AVX512BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
2931; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm1, %zmm1
2932; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
2933; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
2934; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
2935; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2936; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2937; AVX512BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
2938; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
2939; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm7
2940; AVX512BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
2941; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
2942; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
2943; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
2944; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
2945; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
2946; AVX512BW-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
2947; AVX512BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
2948; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
2949; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
2950; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
2951; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
2952; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
2953; AVX512BW-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
2954; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm7, %zmm7
2955; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
2956; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
2957; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
2958; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
2959; AVX512BW-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
2960; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm8, %zmm5
2961; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
2962; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
2963; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
2964; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
2965; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
2966; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
2967; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
2968; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r9)
2969; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rax)
2970; AVX512BW-FCP-NEXT:    vzeroupper
2971; AVX512BW-FCP-NEXT:    retq
2972;
2973; AVX512DQ-BW-LABEL: load_i16_stride6_vf16:
2974; AVX512DQ-BW:       # %bb.0:
2975; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2976; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2977; AVX512DQ-BW-NEXT:    # ymm0 = mem[0,1,0,1]
2978; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm3
2979; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
2980; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
2981; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm0, %zmm0
2982; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
2983; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
2984; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
2985; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2986; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2987; AVX512DQ-BW-NEXT:    # ymm1 = mem[0,1,0,1]
2988; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm1, %zmm1
2989; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
2990; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
2991; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
2992; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2993; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2994; AVX512DQ-BW-NEXT:    # ymm2 = mem[0,1,0,1]
2995; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm6
2996; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm7
2997; AVX512DQ-BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
2998; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
2999; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
3000; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
3001; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
3002; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
3003; AVX512DQ-BW-NEXT:    # ymm8 = mem[0,1,0,1]
3004; AVX512DQ-BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
3005; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
3006; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
3007; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
3008; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
3009; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
3010; AVX512DQ-BW-NEXT:    # ymm7 = mem[0,1,0,1]
3011; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm7, %zmm7
3012; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
3013; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
3014; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
3015; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
3016; AVX512DQ-BW-NEXT:    # ymm8 = mem[0,1,0,1]
3017; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm8, %zmm5
3018; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
3019; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
3020; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
3021; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rsi)
3022; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rdx)
3023; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, (%rcx)
3024; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%r8)
3025; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%r9)
3026; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%rax)
3027; AVX512DQ-BW-NEXT:    vzeroupper
3028; AVX512DQ-BW-NEXT:    retq
3029;
3030; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16:
3031; AVX512DQ-BW-FCP:       # %bb.0:
3032; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3033; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
3034; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
3035; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
3036; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
3037; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
3038; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm0, %zmm0
3039; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
3040; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
3041; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
3042; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3043; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
3044; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
3045; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm1, %zmm1
3046; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
3047; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
3048; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
3049; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3050; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
3051; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
3052; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
3053; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm7
3054; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
3055; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
3056; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
3057; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
3058; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
3059; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
3060; AVX512DQ-BW-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
3061; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
3062; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
3063; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
3064; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
3065; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
3066; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
3067; AVX512DQ-BW-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
3068; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm7, %zmm7
3069; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
3070; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
3071; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
3072; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
3073; AVX512DQ-BW-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
3074; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm8, %zmm5
3075; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
3076; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm8
3077; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
3078; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
3079; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
3080; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
3081; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
3082; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r9)
3083; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rax)
3084; AVX512DQ-BW-FCP-NEXT:    vzeroupper
3085; AVX512DQ-BW-FCP-NEXT:    retq
3086  %wide.vec = load <96 x i16>, ptr %in.vec, align 64
3087  %strided.vec0 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
3088  %strided.vec1 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91>
3089  %strided.vec2 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92>
3090  %strided.vec3 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93>
3091  %strided.vec4 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94>
3092  %strided.vec5 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95>
3093  store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
3094  store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
3095  store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
3096  store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
3097  store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
3098  store <16 x i16> %strided.vec5, ptr %out.vec5, align 64
3099  ret void
3100}
3101
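; The vf32 variant below repeats the same stride-6 deinterleave, widened to 32 elements per output vector.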
3102define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
3103; SSE-LABEL: load_i16_stride6_vf32:
3104; SSE:       # %bb.0:
3105; SSE-NEXT:    subq $456, %rsp # imm = 0x1C8
3106; SSE-NEXT:    movdqa 304(%rdi), %xmm9
3107; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3108; SSE-NEXT:    movdqa 320(%rdi), %xmm5
3109; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3110; SSE-NEXT:    movdqa 64(%rdi), %xmm3
3111; SSE-NEXT:    movdqa 80(%rdi), %xmm0
3112; SSE-NEXT:    movdqa (%rdi), %xmm4
3113; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3114; SSE-NEXT:    movdqa 16(%rdi), %xmm6
3115; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3116; SSE-NEXT:    movdqa 32(%rdi), %xmm1
3117; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3118; SSE-NEXT:    movdqa 48(%rdi), %xmm7
3119; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3120; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7]
3121; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
3122; SSE-NEXT:    movdqa %xmm10, %xmm2
3123; SSE-NEXT:    pandn %xmm1, %xmm2
3124; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3]
3125; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3126; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3127; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
3128; SSE-NEXT:    pand %xmm10, %xmm1
3129; SSE-NEXT:    por %xmm2, %xmm1
3130; SSE-NEXT:    movdqa %xmm1, %xmm2
3131; SSE-NEXT:    movdqa %xmm3, %xmm1
3132; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3]
3133; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3134; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3135; SSE-NEXT:    movdqa %xmm0, %xmm4
3136; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
3137; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3138; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3139; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
3140; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
3141; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3142; SSE-NEXT:    pslld $16, %xmm0
3143; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3144; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3145; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,3,2,3]
3146; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7]
3147; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
3148; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
3149; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3150; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,2,4,5,6,7]
3151; SSE-NEXT:    movdqa %xmm10, %xmm4
3152; SSE-NEXT:    movdqa %xmm10, %xmm1
3153; SSE-NEXT:    pandn %xmm0, %xmm1
3154; SSE-NEXT:    movdqa 288(%rdi), %xmm0
3155; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3156; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3157; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3158; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3159; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
3160; SSE-NEXT:    pand %xmm10, %xmm0
3161; SSE-NEXT:    por %xmm1, %xmm0
3162; SSE-NEXT:    movdqa %xmm0, %xmm5
3163; SSE-NEXT:    movdqa 352(%rdi), %xmm2
3164; SSE-NEXT:    movdqa 368(%rdi), %xmm1
3165; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
3166; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3167; SSE-NEXT:    movdqa %xmm1, %xmm0
3168; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
3169; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3170; SSE-NEXT:    movdqa %xmm2, %xmm0
3171; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3172; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
3173; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
3174; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3175; SSE-NEXT:    pslld $16, %xmm1
3176; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3177; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3178; SSE-NEXT:    movdqa 336(%rdi), %xmm1
3179; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3180; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3181; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7]
3182; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[1,3]
3183; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
3184; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3185; SSE-NEXT:    movdqa 224(%rdi), %xmm0
3186; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3187; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
3188; SSE-NEXT:    movdqa %xmm10, %xmm2
3189; SSE-NEXT:    pandn %xmm0, %xmm2
3190; SSE-NEXT:    movdqa 208(%rdi), %xmm5
3191; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3192; SSE-NEXT:    movdqa 192(%rdi), %xmm0
3193; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3194; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3195; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3196; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3197; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3198; SSE-NEXT:    pand %xmm10, %xmm0
3199; SSE-NEXT:    por %xmm2, %xmm0
3200; SSE-NEXT:    movdqa %xmm0, %xmm2
3201; SSE-NEXT:    movdqa 256(%rdi), %xmm5
3202; SSE-NEXT:    movdqa 272(%rdi), %xmm7
3203; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[2,2,3,3]
3204; SSE-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
3205; SSE-NEXT:    movdqa %xmm7, %xmm0
3206; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0]
3207; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
3208; SSE-NEXT:    movdqa %xmm5, %xmm0
3209; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3210; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0]
3211; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm7[2,3]
3212; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3213; SSE-NEXT:    pslld $16, %xmm7
3214; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3215; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
3216; SSE-NEXT:    movdqa 240(%rdi), %xmm5
3217; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3218; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3]
3219; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm14[0,1,0,2,4,5,6,7]
3220; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[1,3]
3221; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
3222; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3223; SSE-NEXT:    movdqa 128(%rdi), %xmm0
3224; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3225; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
3226; SSE-NEXT:    movdqa %xmm10, %xmm7
3227; SSE-NEXT:    pandn %xmm0, %xmm7
3228; SSE-NEXT:    movdqa 112(%rdi), %xmm11
3229; SSE-NEXT:    movdqa 96(%rdi), %xmm0
3230; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3231; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3]
3232; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7]
3233; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
3234; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3235; SSE-NEXT:    pand %xmm10, %xmm0
3236; SSE-NEXT:    por %xmm7, %xmm0
3237; SSE-NEXT:    movdqa 160(%rdi), %xmm5
3238; SSE-NEXT:    movdqa 176(%rdi), %xmm9
3239; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3]
3240; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
3241; SSE-NEXT:    movdqa %xmm9, %xmm2
3242; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0]
3243; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3244; SSE-NEXT:    movdqa %xmm5, %xmm10
3245; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3246; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm9[0,0]
3247; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[2,3]
3248; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3249; SSE-NEXT:    pslld $16, %xmm9
3250; SSE-NEXT:    psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3251; SSE-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
3252; SSE-NEXT:    movdqa 144(%rdi), %xmm2
3253; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3254; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3]
3255; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7]
3256; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,1],xmm13[1,3]
3257; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0]
3258; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3259; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3260; SSE-NEXT:    movdqa %xmm5, %xmm10
3261; SSE-NEXT:    psrld $16, %xmm10
3262; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3263; SSE-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
3264; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
3265; SSE-NEXT:    movdqa %xmm4, %xmm10
3266; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3267; SSE-NEXT:    pandn %xmm2, %xmm10
3268; SSE-NEXT:    pand %xmm4, %xmm0
3269; SSE-NEXT:    movdqa %xmm4, %xmm13
3270; SSE-NEXT:    por %xmm10, %xmm0
3271; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
3272; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3273; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[1,3]
3274; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
3275; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3276; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3277; SSE-NEXT:    movdqa %xmm7, %xmm8
3278; SSE-NEXT:    psrld $16, %xmm8
3279; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3280; SSE-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
3281; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
3282; SSE-NEXT:    movdqa %xmm13, %xmm8
3283; SSE-NEXT:    movdqa %xmm13, %xmm4
3284; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3285; SSE-NEXT:    pandn %xmm10, %xmm4
3286; SSE-NEXT:    pand %xmm13, %xmm0
3287; SSE-NEXT:    por %xmm4, %xmm0
3288; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
3289; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3]
3290; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
3291; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3292; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3293; SSE-NEXT:    psrld $16, %xmm1
3294; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3295; SSE-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
3296; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3297; SSE-NEXT:    movdqa %xmm13, %xmm1
3298; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3299; SSE-NEXT:    pandn %xmm13, %xmm1
3300; SSE-NEXT:    pand %xmm8, %xmm0
3301; SSE-NEXT:    por %xmm1, %xmm0
3302; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7]
3303; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3]
3304; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0]
3305; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3306; SSE-NEXT:    psrld $16, %xmm11
3307; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7]
3308; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3]
3309; SSE-NEXT:    pand %xmm8, %xmm3
3310; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3311; SSE-NEXT:    pandn %xmm11, %xmm8
3312; SSE-NEXT:    por %xmm3, %xmm8
3313; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7]
3314; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3]
3315; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,0]
3316; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3317; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3318; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3319; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
3320; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
3321; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535]
3322; SSE-NEXT:    movdqa %xmm1, %xmm4
3323; SSE-NEXT:    pandn %xmm2, %xmm4
3324; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3325; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[0,0]
3326; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,3]
3327; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7]
3328; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3329; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm0[1,0,2,3,4,5,6,7]
3330; SSE-NEXT:    pand %xmm1, %xmm6
3331; SSE-NEXT:    por %xmm4, %xmm6
3332; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3333; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3334; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
3335; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3336; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3337; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
3338; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4]
3339; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
3340; SSE-NEXT:    movdqa %xmm15, %xmm0
3341; SSE-NEXT:    pandn %xmm4, %xmm0
3342; SSE-NEXT:    pand %xmm15, %xmm6
3343; SSE-NEXT:    por %xmm6, %xmm0
3344; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3345; SSE-NEXT:    movdqa %xmm10, %xmm4
3346; SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3347; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3348; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1]
3349; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
3350; SSE-NEXT:    movdqa %xmm1, %xmm6
3351; SSE-NEXT:    pandn %xmm4, %xmm6
3352; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3353; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[0,0]
3354; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3]
3355; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7]
3356; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
3357; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
3358; SSE-NEXT:    pand %xmm1, %xmm8
3359; SSE-NEXT:    por %xmm6, %xmm8
3360; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3361; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3362; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
3363; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3364; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
3365; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
3366; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
3367; SSE-NEXT:    movdqa %xmm15, %xmm0
3368; SSE-NEXT:    pandn %xmm5, %xmm0
3369; SSE-NEXT:    pand %xmm15, %xmm8
3370; SSE-NEXT:    por %xmm8, %xmm0
3371; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3372; SSE-NEXT:    movdqa %xmm13, %xmm5
3373; SSE-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3374; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3375; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
3376; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
3377; SSE-NEXT:    movdqa %xmm1, %xmm6
3378; SSE-NEXT:    pandn %xmm5, %xmm6
3379; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3380; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3381; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0]
3382; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3]
3383; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
3384; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
3385; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
3386; SSE-NEXT:    pand %xmm1, %xmm8
3387; SSE-NEXT:    por %xmm6, %xmm8
3388; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3389; SSE-NEXT:    shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload
3390; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
3391; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3392; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
3393; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
3394; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
3395; SSE-NEXT:    movdqa %xmm15, %xmm0
3396; SSE-NEXT:    pandn %xmm2, %xmm0
3397; SSE-NEXT:    pand %xmm15, %xmm8
3398; SSE-NEXT:    por %xmm8, %xmm0
3399; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
3400; SSE-NEXT:    psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3401; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3402; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1]
3403; SSE-NEXT:    punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm6[0]
3404; SSE-NEXT:    movdqa %xmm1, %xmm6
3405; SSE-NEXT:    pandn %xmm11, %xmm6
3406; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3407; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3408; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
3409; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
3410; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
3411; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
3412; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
3413; SSE-NEXT:    pand %xmm1, %xmm8
3414; SSE-NEXT:    por %xmm6, %xmm8
3415; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3416; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3417; SSE-NEXT:    # xmm11 = xmm11[0,1],mem[0,2]
3418; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,6,6,7]
3419; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
3420; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
3421; SSE-NEXT:    movdqa %xmm15, %xmm0
3422; SSE-NEXT:    pandn %xmm6, %xmm0
3423; SSE-NEXT:    pand %xmm15, %xmm8
3424; SSE-NEXT:    por %xmm8, %xmm0
3425; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3426; SSE-NEXT:    movdqa %xmm14, %xmm0
3427; SSE-NEXT:    movdqa %xmm14, %xmm6
3428; SSE-NEXT:    psrlq $48, %xmm6
3429; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3430; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3]
3431; SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0]
3432; SSE-NEXT:    movdqa %xmm1, %xmm6
3433; SSE-NEXT:    pandn %xmm7, %xmm6
3434; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
3435; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
3436; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
3437; SSE-NEXT:    pand %xmm1, %xmm3
3438; SSE-NEXT:    por %xmm6, %xmm3
3439; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3440; SSE-NEXT:    # xmm6 = mem[0,1,2,3,7,5,6,7]
3441; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
3442; SSE-NEXT:    movdqa %xmm15, %xmm7
3443; SSE-NEXT:    pandn %xmm6, %xmm7
3444; SSE-NEXT:    pand %xmm15, %xmm3
3445; SSE-NEXT:    por %xmm3, %xmm7
3446; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3447; SSE-NEXT:    movdqa %xmm12, %xmm3
3448; SSE-NEXT:    psrlq $48, %xmm3
3449; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3450; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3]
3451; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
3452; SSE-NEXT:    movdqa %xmm1, %xmm3
3453; SSE-NEXT:    pandn %xmm6, %xmm3
3454; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
3455; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3456; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
3457; SSE-NEXT:    pand %xmm1, %xmm4
3458; SSE-NEXT:    por %xmm3, %xmm4
3459; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3460; SSE-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,7]
3461; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
3462; SSE-NEXT:    movdqa %xmm15, %xmm6
3463; SSE-NEXT:    pandn %xmm3, %xmm6
3464; SSE-NEXT:    pand %xmm15, %xmm4
3465; SSE-NEXT:    por %xmm4, %xmm6
3466; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3467; SSE-NEXT:    movdqa %xmm10, %xmm3
3468; SSE-NEXT:    movdqa %xmm10, %xmm14
3469; SSE-NEXT:    psrlq $48, %xmm3
3470; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3]
3471; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
3472; SSE-NEXT:    movdqa %xmm1, %xmm3
3473; SSE-NEXT:    pandn %xmm4, %xmm3
3474; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7]
3475; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3476; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
3477; SSE-NEXT:    pand %xmm1, %xmm4
3478; SSE-NEXT:    por %xmm3, %xmm4
3479; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3480; SSE-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,7]
3481; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
3482; SSE-NEXT:    movdqa %xmm15, %xmm5
3483; SSE-NEXT:    pandn %xmm3, %xmm5
3484; SSE-NEXT:    pand %xmm15, %xmm4
3485; SSE-NEXT:    por %xmm4, %xmm5
3486; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3487; SSE-NEXT:    movdqa %xmm9, %xmm3
3488; SSE-NEXT:    psrlq $48, %xmm3
3489; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3490; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3]
3491; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
3492; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
3493; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
3494; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
3495; SSE-NEXT:    pand %xmm1, %xmm2
3496; SSE-NEXT:    pandn %xmm4, %xmm1
3497; SSE-NEXT:    por %xmm2, %xmm1
3498; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7]
3499; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
3500; SSE-NEXT:    movdqa %xmm15, %xmm3
3501; SSE-NEXT:    pandn %xmm2, %xmm3
3502; SSE-NEXT:    pand %xmm15, %xmm1
3503; SSE-NEXT:    por %xmm1, %xmm3
3504; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3505; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3506; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
3507; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3508; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
3509; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3510; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3511; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,4,6]
3512; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3513; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
3514; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3515; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7]
3516; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3517; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
3518; SSE-NEXT:    movdqa %xmm15, %xmm11
3519; SSE-NEXT:    pandn %xmm3, %xmm11
3520; SSE-NEXT:    andps %xmm15, %xmm1
3521; SSE-NEXT:    por %xmm1, %xmm11
3522; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3523; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1]
3524; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3525; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
3526; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3527; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3]
3528; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3529; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6]
3530; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3531; SSE-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1]
3532; SSE-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
3533; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3534; SSE-NEXT:    # xmm3 = mem[0,2,2,3,4,5,6,7]
3535; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3536; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
3537; SSE-NEXT:    movdqa %xmm15, %xmm5
3538; SSE-NEXT:    pandn %xmm3, %xmm5
3539; SSE-NEXT:    andps %xmm15, %xmm4
3540; SSE-NEXT:    por %xmm4, %xmm5
3541; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3542; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
3543; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3544; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
3545; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
3546; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3547; SSE-NEXT:    # xmm0 = mem[0,1,0,3]
3548; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3549; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6]
3550; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm14[1]
3551; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
3552; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3553; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
3554; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
3555; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm4[0,1,2,3,4,5,4,6]
3556; SSE-NEXT:    movdqa %xmm15, %xmm4
3557; SSE-NEXT:    pandn %xmm14, %xmm4
3558; SSE-NEXT:    andps %xmm15, %xmm3
3559; SSE-NEXT:    por %xmm3, %xmm4
3560; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
3561; SSE-NEXT:    # xmm14 = mem[1,1,1,1]
3562; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3563; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
3564; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
3565; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm7[0,1,0,3]
3566; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,4,6]
3567; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1]
3568; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
3569; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3570; SSE-NEXT:    # xmm3 = mem[0,2,2,3,4,5,6,7]
3571; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3572; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6]
3573; SSE-NEXT:    movdqa %xmm15, %xmm3
3574; SSE-NEXT:    pandn %xmm1, %xmm3
3575; SSE-NEXT:    andps %xmm15, %xmm0
3576; SSE-NEXT:    por %xmm0, %xmm3
3577; SSE-NEXT:    movdqa %xmm2, %xmm1
3578; SSE-NEXT:    psrlq $48, %xmm1
3579; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3580; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
3581; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3582; SSE-NEXT:    psrld $16, %xmm1
3583; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7]
3584; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3585; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
3586; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7]
3587; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
3588; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
3589; SSE-NEXT:    movdqa %xmm15, %xmm2
3590; SSE-NEXT:    pandn %xmm1, %xmm2
3591; SSE-NEXT:    andps %xmm15, %xmm0
3592; SSE-NEXT:    por %xmm0, %xmm2
3593; SSE-NEXT:    psrlq $48, %xmm13
3594; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3595; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3596; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
3597; SSE-NEXT:    movdqa %xmm0, %xmm1
3598; SSE-NEXT:    psrld $16, %xmm12
3599; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3600; SSE-NEXT:    # xmm0 = mem[0,1,2,3,4,5,5,7]
3601; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1]
3602; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3603; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3604; SSE-NEXT:    # xmm1 = mem[3,1,2,3,4,5,6,7]
3605; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
3606; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7]
3607; SSE-NEXT:    movdqa %xmm15, %xmm1
3608; SSE-NEXT:    pandn %xmm8, %xmm1
3609; SSE-NEXT:    andps %xmm15, %xmm0
3610; SSE-NEXT:    por %xmm0, %xmm1
3611; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3612; SSE-NEXT:    psrlq $48, %xmm6
3613; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3614; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3615; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
3616; SSE-NEXT:    movdqa %xmm0, %xmm6
3617; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3618; SSE-NEXT:    psrld $16, %xmm7
3619; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3620; SSE-NEXT:    # xmm0 = mem[0,1,2,3,4,5,5,7]
3621; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1]
3622; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
3623; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3624; SSE-NEXT:    # xmm8 = mem[3,1,2,3,4,5,6,7]
3625; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3626; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
3627; SSE-NEXT:    movdqa %xmm15, %xmm9
3628; SSE-NEXT:    pandn %xmm8, %xmm9
3629; SSE-NEXT:    andps %xmm15, %xmm0
3630; SSE-NEXT:    por %xmm0, %xmm9
3631; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3632; SSE-NEXT:    psrlq $48, %xmm6
3633; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3634; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3635; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
3636; SSE-NEXT:    movdqa %xmm0, %xmm6
3637; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3638; SSE-NEXT:    psrld $16, %xmm7
3639; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7]
3640; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1]
3641; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
3642; SSE-NEXT:    andps %xmm15, %xmm0
3643; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3644; SSE-NEXT:    # xmm8 = mem[3,1,2,3,4,5,6,7]
3645; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3646; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
3647; SSE-NEXT:    pandn %xmm8, %xmm15
3648; SSE-NEXT:    por %xmm0, %xmm15
3649; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3650; SSE-NEXT:    movaps %xmm0, 16(%rsi)
3651; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3652; SSE-NEXT:    movaps %xmm0, 32(%rsi)
3653; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3654; SSE-NEXT:    movaps %xmm0, 48(%rsi)
3655; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3656; SSE-NEXT:    movaps %xmm0, (%rsi)
3657; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3658; SSE-NEXT:    movaps %xmm0, 16(%rdx)
3659; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3660; SSE-NEXT:    movaps %xmm0, 32(%rdx)
3661; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3662; SSE-NEXT:    movaps %xmm0, 48(%rdx)
3663; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3664; SSE-NEXT:    movaps %xmm0, (%rdx)
3665; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3666; SSE-NEXT:    movaps %xmm0, 16(%rcx)
3667; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
3668; SSE-NEXT:    movaps %xmm0, 32(%rcx)
3669; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3670; SSE-NEXT:    movaps %xmm0, 48(%rcx)
3671; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3672; SSE-NEXT:    movaps %xmm0, (%rcx)
3673; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3674; SSE-NEXT:    movaps %xmm0, 16(%r8)
3675; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3676; SSE-NEXT:    movaps %xmm0, 32(%r8)
3677; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3678; SSE-NEXT:    movaps %xmm0, 48(%r8)
3679; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3680; SSE-NEXT:    movaps %xmm0, (%r8)
3681; SSE-NEXT:    movdqa %xmm3, 16(%r9)
3682; SSE-NEXT:    movdqa %xmm4, 32(%r9)
3683; SSE-NEXT:    movdqa %xmm5, 48(%r9)
3684; SSE-NEXT:    movdqa %xmm11, (%r9)
3685; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3686; SSE-NEXT:    movdqa %xmm15, 16(%rax)
3687; SSE-NEXT:    movdqa %xmm9, 32(%rax)
3688; SSE-NEXT:    movdqa %xmm1, 48(%rax)
3689; SSE-NEXT:    movdqa %xmm2, (%rax)
3690; SSE-NEXT:    addq $456, %rsp # imm = 0x1C8
3691; SSE-NEXT:    retq
3692;
3693; AVX-LABEL: load_i16_stride6_vf32:
3694; AVX:       # %bb.0:
3695; AVX-NEXT:    subq $552, %rsp # imm = 0x228
3696; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
3697; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3698; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3]
3699; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7]
3700; AVX-NEXT:    vmovdqa 112(%rdi), %xmm1
3701; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3702; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3703; AVX-NEXT:    vmovdqa 80(%rdi), %xmm1
3704; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3705; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
3706; AVX-NEXT:    vmovdqa 64(%rdi), %xmm12
3707; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3708; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3709; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3710; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3711; AVX-NEXT:    vmovdqa (%rdi), %xmm3
3712; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3713; AVX-NEXT:    vmovdqa 16(%rdi), %xmm10
3714; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
3715; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3716; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
3717; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3718; AVX-NEXT:    vpsrlq $16, %xmm1, %xmm1
3719; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,3]
3720; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7]
3721; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3722; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3]
3723; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7]
3724; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
3725; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
3726; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3727; AVX-NEXT:    vmovdqa 176(%rdi), %xmm1
3728; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3729; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
3730; AVX-NEXT:    vmovdqa 160(%rdi), %xmm2
3731; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3732; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3733; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3734; AVX-NEXT:    vmovdqa 128(%rdi), %xmm14
3735; AVX-NEXT:    vpsrlq $16, %xmm14, %xmm2
3736; AVX-NEXT:    vmovdqa 144(%rdi), %xmm3
3737; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3738; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3]
3739; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
3740; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3741; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
3742; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3743; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
3744; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3745; AVX-NEXT:    vandnps %ymm1, %ymm5, %ymm1
3746; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
3747; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3748; AVX-NEXT:    vmovdqa 272(%rdi), %xmm0
3749; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
3750; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
3751; AVX-NEXT:    vmovdqa 256(%rdi), %xmm1
3752; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3753; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3754; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3755; AVX-NEXT:    vmovdqa 288(%rdi), %xmm1
3756; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3757; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3]
3758; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7]
3759; AVX-NEXT:    vmovdqa 304(%rdi), %xmm2
3760; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3761; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3762; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3763; AVX-NEXT:    vmovdqa 224(%rdi), %xmm1
3764; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3765; AVX-NEXT:    vpsrlq $16, %xmm1, %xmm1
3766; AVX-NEXT:    vmovdqa 240(%rdi), %xmm2
3767; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3768; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
3769; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7]
3770; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3771; AVX-NEXT:    vmovdqa 192(%rdi), %xmm2
3772; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3773; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,3]
3774; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7]
3775; AVX-NEXT:    vmovdqa 208(%rdi), %xmm2
3776; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3777; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3]
3778; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7]
3779; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3780; AVX-NEXT:    vmovdqa 368(%rdi), %xmm0
3781; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3782; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
3783; AVX-NEXT:    vmovdqa 352(%rdi), %xmm0
3784; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3785; AVX-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3786; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
3787; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
3788; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3789; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm0
3790; AVX-NEXT:    vmovdqa 336(%rdi), %xmm1
3791; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3792; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3793; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7]
3794; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3795; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
3796; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
3797; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
3798; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
3799; AVX-NEXT:    vmovaps %ymm5, %ymm9
3800; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
3801; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3802; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7]
3803; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3804; AVX-NEXT:    vpsrld $16, %xmm11, %xmm2
3805; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3806; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3]
3807; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3808; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
3809; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
3810; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
3811; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3812; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
3813; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
3814; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,5,7,6,7]
3815; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3816; AVX-NEXT:    vpsrld $16, %xmm10, %xmm8
3817; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
3818; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5],xmm7[6,7]
3819; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3820; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7]
3821; AVX-NEXT:    vmovdqa %xmm14, %xmm7
3822; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1]
3823; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
3824; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3825; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm14[2,2,3,3]
3826; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3827; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
3828; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm6[6,7]
3829; AVX-NEXT:    vandps %ymm0, %ymm9, %ymm0
3830; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
3831; AVX-NEXT:    vandnps %ymm2, %ymm9, %ymm2
3832; AVX-NEXT:    vmovaps %ymm9, %ymm6
3833; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
3834; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3835; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7]
3836; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3837; AVX-NEXT:    vpsrld $16, %xmm15, %xmm2
3838; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3839; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3840; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
3841; AVX-NEXT:    vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
3842; AVX-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
3843; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
3844; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
3845; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3846; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
3847; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3848; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7]
3849; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3850; AVX-NEXT:    vpsrld $16, %xmm13, %xmm4
3851; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3852; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
3853; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3854; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
3855; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3856; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
3857; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3858; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3859; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
3860; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3861; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3862; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
3863; AVX-NEXT:    vandps %ymm6, %ymm0, %ymm0
3864; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3865; AVX-NEXT:    vandnps %ymm1, %ymm6, %ymm1
3866; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
3867; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3868; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3869; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
3870; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3871; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3872; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
3873; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload
3874; AVX-NEXT:    # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7]
3875; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3876; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
3877; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
3878; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
3879; AVX-NEXT:    # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7]
3880; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3881; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
3882; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
3883; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload
3884; AVX-NEXT:    # xmm10 = mem[0,1,2,3],xmm12[4,5],mem[6,7]
3885; AVX-NEXT:    vpshufb %xmm2, %xmm10, %xmm6
3886; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm6
3887; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
3888; AVX-NEXT:    vandnps %ymm5, %ymm1, %ymm5
3889; AVX-NEXT:    vandps %ymm1, %ymm6, %ymm6
3890; AVX-NEXT:    vorps %ymm5, %ymm6, %ymm6
3891; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3892; AVX-NEXT:    # xmm5 = mem[1,1,1,1]
3893; AVX-NEXT:    vmovdqa %xmm7, %xmm12
3894; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3895; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3896; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0]
3897; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm8[4,5],xmm14[6,7]
3898; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
3899; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
3900; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3901; AVX-NEXT:    vandps %ymm6, %ymm8, %ymm6
3902; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
3903; AVX-NEXT:    vandnps %ymm7, %ymm8, %ymm7
3904; AVX-NEXT:    vorps %ymm7, %ymm6, %ymm6
3905; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3906; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3907; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1]
3908; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3909; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0]
3910; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload
3911; AVX-NEXT:    # xmm6 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7]
3912; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3913; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
3914; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
3915; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload
3916; AVX-NEXT:    # xmm7 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
3917; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
3918; AVX-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
3919; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload
3920; AVX-NEXT:    # xmm8 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
3921; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm15
3922; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm15, %ymm0
3923; AVX-NEXT:    vandnps %ymm11, %ymm1, %ymm11
3924; AVX-NEXT:    vandps %ymm1, %ymm0, %ymm0
3925; AVX-NEXT:    vorps %ymm0, %ymm11, %ymm11
3926; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3927; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
3928; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3929; AVX-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3930; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
3931; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload
3932; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
3933; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
3934; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3,4],xmm2[5,6,7]
3935; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3936; AVX-NEXT:    vandps %ymm15, %ymm11, %ymm11
3937; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
3938; AVX-NEXT:    vandnps %ymm2, %ymm15, %ymm2
3939; AVX-NEXT:    vorps %ymm2, %ymm11, %ymm2
3940; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3941; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm2
3942; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3943; AVX-NEXT:    # xmm11 = mem[2,2,3,3]
3944; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm11[0],xmm2[0]
3945; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
3946; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3947; AVX-NEXT:    vpshufb %xmm11, %xmm3, %xmm9
3948; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4],xmm9[5,6,7]
3949; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3950; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm3
3951; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
3952; AVX-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
3953; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
3954; AVX-NEXT:    vandnps %ymm9, %ymm1, %ymm4
3955; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm3
3956; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
3957; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3958; AVX-NEXT:    vpsrlq $48, %xmm10, %xmm4
3959; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3]
3960; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm9[0],xmm4[0]
3961; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
3962; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
3963; AVX-NEXT:    vandps %ymm3, %ymm15, %ymm3
3964; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
3965; AVX-NEXT:    vandnps %ymm4, %ymm15, %ymm4
3966; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
3967; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3968; AVX-NEXT:    vpsrlq $48, %xmm14, %xmm3
3969; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3970; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
3971; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
3972; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3973; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
3974; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
3975; AVX-NEXT:    vpshufb %xmm11, %xmm7, %xmm4
3976; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
3977; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
3978; AVX-NEXT:    vandnps %ymm3, %ymm1, %ymm3
3979; AVX-NEXT:    vandps %ymm1, %ymm4, %ymm1
3980; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
3981; AVX-NEXT:    vpsrlq $48, %xmm13, %xmm3
3982; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3]
3983; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
3984; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3985; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
3986; AVX-NEXT:    vandps %ymm1, %ymm15, %ymm1
3987; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
3988; AVX-NEXT:    vandnps %ymm0, %ymm15, %ymm0
3989; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
3990; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3991; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3992; AVX-NEXT:    # xmm1 = mem[1,1,1,1]
3993; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3994; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
3995; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3996; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3997; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3998; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
3999; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4000; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
4001; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
4002; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
4003; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4004; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
4005; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4006; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
4007; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
4008; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4009; AVX-NEXT:    # xmm0 = mem[0,1,0,3]
4010; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4011; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,6]
4012; AVX-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4013; AVX-NEXT:    # xmm5 = xmm5[1],mem[1]
4014; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3,4,5,6,7]
4015; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4016; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
4017; AVX-NEXT:    vandps %ymm2, %ymm5, %ymm5
4018; AVX-NEXT:    vorps %ymm1, %ymm5, %ymm1
4019; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4020; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
4021; AVX-NEXT:    # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
4022; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4023; AVX-NEXT:    # xmm7 = mem[0,1,0,3]
4024; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,6]
4025; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm10[1]
4026; AVX-NEXT:    vpshufb %xmm3, %xmm5, %xmm9
4027; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
4028; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
4029; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7]
4030; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4031; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4032; AVX-NEXT:    # xmm8 = mem[1,1,1,1]
4033; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4034; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
4035; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
4036; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
4037; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4038; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
4039; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4040; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm10
4041; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
4042; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4043; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[1,1,1,1]
4044; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4045; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3]
4046; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
4047; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3]
4048; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,4,6]
4049; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4050; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1]
4051; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3,4,5,6,7]
4052; AVX-NEXT:    vandnps %ymm9, %ymm2, %ymm9
4053; AVX-NEXT:    vandps %ymm2, %ymm11, %ymm11
4054; AVX-NEXT:    vorps %ymm9, %ymm11, %ymm13
4055; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4056; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
4057; AVX-NEXT:    # xmm11 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
4058; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
4059; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4060; AVX-NEXT:    # xmm9 = mem[0,1,0,3]
4061; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,6]
4062; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4063; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1]
4064; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7]
4065; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
4066; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7]
4067; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4068; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm12
4069; AVX-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4070; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
4071; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
4072; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4073; AVX-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
4074; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm4, %ymm4
4075; AVX-NEXT:    vpsrlq $48, %xmm14, %xmm13
4076; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4077; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
4078; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4079; AVX-NEXT:    vpsrld $16, %xmm6, %xmm14
4080; AVX-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4081; AVX-NEXT:    # xmm6 = mem[0,1,2,3,4,5,5,7]
4082; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm14[1]
4083; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7]
4084; AVX-NEXT:    vandnps %ymm4, %ymm2, %ymm4
4085; AVX-NEXT:    vandps %ymm2, %ymm6, %ymm6
4086; AVX-NEXT:    vorps %ymm4, %ymm6, %ymm4
4087; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4088; AVX-NEXT:    vpsrld $16, %xmm6, %xmm6
4089; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7]
4090; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
4091; AVX-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
4092; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
4093; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
4094; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
4095; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4096; AVX-NEXT:    vpsrlq $48, %xmm5, %xmm5
4097; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4098; AVX-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4099; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4100; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4101; AVX-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
4102; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
4103; AVX-NEXT:    vpsrlq $48, %xmm15, %xmm6
4104; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4105; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4106; AVX-NEXT:    vpsrld $16, %xmm1, %xmm7
4107; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7]
4108; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1]
4109; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
4110; AVX-NEXT:    vandnps %ymm5, %ymm2, %ymm5
4111; AVX-NEXT:    vandps %ymm2, %ymm6, %ymm2
4112; AVX-NEXT:    vorps %ymm5, %ymm2, %ymm2
4113; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm5
4114; AVX-NEXT:    vpsrld $16, %xmm0, %xmm6
4115; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,7]
4116; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
4117; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
4118; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
4119; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
4120; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4121; AVX-NEXT:    vmovaps %ymm0, 32(%rsi)
4122; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4123; AVX-NEXT:    vmovaps %ymm5, (%rsi)
4124; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4125; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
4126; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4127; AVX-NEXT:    vmovaps %ymm0, (%rdx)
4128; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4129; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
4130; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4131; AVX-NEXT:    vmovaps %ymm0, (%rcx)
4132; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4133; AVX-NEXT:    vmovaps %ymm0, 32(%r8)
4134; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4135; AVX-NEXT:    vmovaps %ymm0, (%r8)
4136; AVX-NEXT:    vmovaps %ymm3, 32(%r9)
4137; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4138; AVX-NEXT:    vmovaps %ymm0, (%r9)
4139; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4140; AVX-NEXT:    vmovaps %ymm2, 32(%rax)
4141; AVX-NEXT:    vmovaps %ymm4, (%rax)
4142; AVX-NEXT:    addq $552, %rsp # imm = 0x228
4143; AVX-NEXT:    vzeroupper
4144; AVX-NEXT:    retq
4145;
4146; AVX2-LABEL: load_i16_stride6_vf32:
4147; AVX2:       # %bb.0:
4148; AVX2-NEXT:    subq $488, %rsp # imm = 0x1E8
4149; AVX2-NEXT:    vmovdqa (%rdi), %ymm5
4150; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4151; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
4152; AVX2-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
4153; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
4154; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
4155; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm10
4156; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm11
4157; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
4158; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm3
4159; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
4160; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
4161; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
4162; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4163; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4164; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4165; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
4166; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
4167; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
4168; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
4169; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
4170; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
4171; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
4172; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7]
4173; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7]
4174; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
4175; AVX2-NEXT:    vpblendvb %ymm0, %ymm9, %ymm4, %ymm4
4176; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4177; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4178; AVX2-NEXT:    vmovdqa %ymm11, %ymm5
4179; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4180; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
4181; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
4182; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm9
4183; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
4184; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7]
4185; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4186; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4187; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
4188; AVX2-NEXT:    vpshufb %ymm6, %ymm11, %ymm6
4189; AVX2-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm6
4190; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4191; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
4192; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
4193; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
4194; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
4195; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
4196; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
4197; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
4198; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
4199; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4200; AVX2-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
4201; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm3
4202; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7]
4203; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,3]
4204; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
4205; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
4206; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
4207; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
4208; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4209; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm9
4210; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3]
4211; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
4212; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
4213; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
4214; AVX2-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
4215; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
4216; AVX2-NEXT:    vpshufb %ymm15, %ymm12, %ymm1
4217; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4218; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm11
4219; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm13
4220; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7]
4221; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4222; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4223; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
4224; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
4225; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7]
4226; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
4227; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
4228; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4229; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
4230; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4231; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4232; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm0
4233; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4234; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm14
4235; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
4236; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7]
4237; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
4238; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7]
4239; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
4240; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4241; AVX2-NEXT:    vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
4242; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
4243; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
4244; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
4245; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3]
4246; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
4247; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
4248; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4249; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
4250; AVX2-NEXT:    # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7]
4251; AVX2-NEXT:    vpshufb %ymm15, %ymm10, %ymm15
4252; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
4253; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4254; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
4255; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4256; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4257; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3]
4258; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
4259; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4260; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
4261; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7]
4262; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
4263; AVX2-NEXT:    vpshufb %ymm6, %ymm12, %ymm7
4264; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
4265; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
4266; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7]
4267; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
4268; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm8
4269; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4270; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7]
4271; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4272; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4273; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4274; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4275; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm1
4276; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5]
4277; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
4278; AVX2-NEXT:    vpshufb %ymm6, %ymm10, %ymm2
4279; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3]
4280; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
4281; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
4282; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
4283; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6],xmm3[7]
4284; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
4285; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
4286; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
4287; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4288; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4289; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4290; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4291; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4292; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
4293; AVX2-NEXT:    # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4294; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
4295; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4296; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1]
4297; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
4298; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7]
4299; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
4300; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4]
4301; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
4302; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4303; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4304; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
4305; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3]
4306; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
4307; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
4308; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7]
4309; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
4310; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7]
4311; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
4312; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4313; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
4314; AVX2-NEXT:    vpshufb %ymm11, %ymm8, %ymm2
4315; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
4316; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
4317; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
4318; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4319; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4320; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
4321; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
4322; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4323; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
4324; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
4325; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
4326; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
4327; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4]
4328; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
4329; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4330; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4331; AVX2-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4332; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4333; AVX2-NEXT:    vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
4334; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
4335; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
4336; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
4337; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
4338; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7]
4339; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
4340; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7]
4341; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7]
4342; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4343; AVX2-NEXT:    vpshufb %ymm11, %ymm2, %ymm11
4344; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
4345; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4]
4346; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7]
4347; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4348; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
4349; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
4350; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
4351; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7]
4352; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7]
4353; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
4354; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
4355; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7]
4356; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
4357; AVX2-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
4358; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4359; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
4360; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
4361; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7]
4362; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
4363; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
4364; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
4365; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
4366; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
4367; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
4368; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
4369; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
4370; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4371; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
4372; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm1
4373; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4374; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
4375; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
4376; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4377; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
4378; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
4379; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4380; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7]
4381; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
4382; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
4383; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
4384; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
4385; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4386; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4387; AVX2-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
4388; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4389; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
4390; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
4391; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
4392; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
4393; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
4394; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7]
4395; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
4396; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7]
4397; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
4398; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4399; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
4400; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
4401; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
4402; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4403; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
4404; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
4405; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4406; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4407; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4408; AVX2-NEXT:    vpshufb %xmm8, %xmm7, %xmm2
4409; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
4410; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
4411; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
4412; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
4413; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4414; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
4415; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4416; AVX2-NEXT:    vmovaps %ymm6, 32(%rsi)
4417; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4418; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
4419; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4420; AVX2-NEXT:    vmovaps %ymm6, 32(%rdx)
4421; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4422; AVX2-NEXT:    vmovaps %ymm6, (%rdx)
4423; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4424; AVX2-NEXT:    vmovaps %ymm6, 32(%rcx)
4425; AVX2-NEXT:    vmovdqa %ymm9, (%rcx)
4426; AVX2-NEXT:    vmovdqa %ymm5, 32(%r8)
4427; AVX2-NEXT:    vmovdqa %ymm0, (%r8)
4428; AVX2-NEXT:    vmovdqa %ymm4, 32(%r9)
4429; AVX2-NEXT:    vmovdqa %ymm3, (%r9)
4430; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4431; AVX2-NEXT:    vmovdqa %ymm2, 32(%rax)
4432; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
4433; AVX2-NEXT:    addq $488, %rsp # imm = 0x1E8
4434; AVX2-NEXT:    vzeroupper
4435; AVX2-NEXT:    retq
4436;
4437; AVX2-FP-LABEL: load_i16_stride6_vf32:
4438; AVX2-FP:       # %bb.0:
4439; AVX2-FP-NEXT:    subq $456, %rsp # imm = 0x1C8
4440; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm5
4441; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4442; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm7
4443; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4444; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
4445; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
4446; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm9
4447; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm10
4448; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
4449; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm3
4450; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
4451; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
4452; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
4453; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4454; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4455; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
4456; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
4457; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
4458; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm6
4459; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
4460; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
4461; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
4462; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
4463; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
4464; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
4465; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
4466; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
4467; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4468; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4469; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4470; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
4471; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
4472; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm8
4473; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7]
4474; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
4475; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4476; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
4477; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm11, %ymm4
4478; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm7, %ymm4, %ymm4
4479; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4480; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
4481; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
4482; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4483; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
4484; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
4485; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
4486; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
4487; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4488; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
4489; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm8, %xmm3
4490; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
4491; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
4492; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
4493; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
4494; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
4495; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4496; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
4497; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm1
4498; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
4499; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm11, %xmm1
4500; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
4501; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
4502; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
4503; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm10, %ymm1
4504; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4505; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm1
4506; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4507; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm2
4508; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4509; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
4510; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
4511; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
4512; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm9
4513; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
4514; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
4515; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4516; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
4517; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
4518; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4519; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm0
4520; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4521; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm8
4522; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
4523; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
4524; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm4
4525; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
4526; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
4527; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4528; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4529; AVX2-FP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
4530; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm12
4531; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
4532; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
4533; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
4534; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
4535; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4536; AVX2-FP-NEXT:    vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload
4537; AVX2-FP-NEXT:    # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
4538; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
4539; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
4540; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4541; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
4542; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4543; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4544; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
4545; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
4546; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7]
4547; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
4548; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
4549; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
4550; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
4551; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
4552; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
4553; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
4554; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
4555; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4556; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
4557; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4558; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
4559; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
4560; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4561; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
4562; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4563; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
4564; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm15, %ymm3
4565; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
4566; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7]
4567; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
4568; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
4569; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
4570; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4571; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4572; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4573; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4574; AVX2-FP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
4575; AVX2-FP-NEXT:    # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
4576; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4577; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4578; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
4579; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
4580; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
4581; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
4582; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
4583; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
4584; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
4585; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
4586; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4587; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4588; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
4589; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm6
4590; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
4591; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
4592; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
4593; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm10, %xmm9
4594; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
4595; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
4596; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4597; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
4598; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm14, %ymm12
4599; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
4600; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
4601; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
4602; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
4603; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4604; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4605; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
4606; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
4607; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
4608; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm9
4609; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
4610; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
4611; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
4612; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4613; AVX2-FP-NEXT:    vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
4614; AVX2-FP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4615; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4616; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4617; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
4618; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm11
4619; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
4620; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
4621; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4622; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
4623; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
4624; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4625; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm13, %ymm6
4626; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
4627; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
4628; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
4629; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
4630; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
4631; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
4632; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
4633; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
4634; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
4635; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm10, %xmm4
4636; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
4637; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
4638; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
4639; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm14, %ymm4
4640; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4641; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
4642; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
4643; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
4644; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4645; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
4646; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
4647; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
4648; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm13, %ymm2
4649; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm11, %xmm3
4650; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
4651; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
4652; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4653; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4654; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
4655; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
4656; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4657; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
4658; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
4659; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4660; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
4661; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
4662; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm7
4663; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
4664; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
4665; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
4666; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
4667; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4668; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
4669; AVX2-FP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
4670; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
4671; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
4672; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm8, %xmm9
4673; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
4674; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
4675; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4676; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4677; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
4678; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
4679; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
4680; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
4681; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
4682; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4683; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4684; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4685; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
4686; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
4687; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
4688; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
4689; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4690; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
4691; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4692; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
4693; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4694; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
4695; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4696; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
4697; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4698; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
4699; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4700; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rcx)
4701; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rcx)
4702; AVX2-FP-NEXT:    vmovdqa %ymm4, 32(%r8)
4703; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r8)
4704; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%r9)
4705; AVX2-FP-NEXT:    vmovdqa %ymm5, (%r9)
4706; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4707; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rax)
4708; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
4709; AVX2-FP-NEXT:    addq $456, %rsp # imm = 0x1C8
4710; AVX2-FP-NEXT:    vzeroupper
4711; AVX2-FP-NEXT:    retq
4712;
4713; AVX2-FCP-LABEL: load_i16_stride6_vf32:
4714; AVX2-FCP:       # %bb.0:
4715; AVX2-FCP-NEXT:    subq $456, %rsp # imm = 0x1C8
4716; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm5
4717; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4718; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
4719; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4720; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
4721; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
4722; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm9
4723; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm10
4724; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
4725; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm3
4726; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
4727; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
4728; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
4729; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4730; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4731; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
4732; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
4733; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
4734; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm6
4735; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
4736; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
4737; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
4738; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
4739; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
4740; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
4741; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
4742; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
4743; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4744; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4745; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4746; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
4747; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
4748; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
4749; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7]
4750; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
4751; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4752; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
4753; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm4
4754; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm7, %ymm4, %ymm4
4755; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4756; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
4757; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
4758; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4759; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
4760; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
4761; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
4762; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
4763; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4764; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
4765; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm3
4766; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
4767; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
4768; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
4769; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
4770; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
4771; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4772; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
4773; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
4774; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
4775; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm1
4776; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
4777; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
4778; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
4779; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm1
4780; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4781; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm1
4782; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4783; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
4784; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4785; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
4786; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
4787; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
4788; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
4789; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
4790; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
4791; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4792; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
4793; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
4794; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4795; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
4796; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4797; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
4798; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
4799; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
4800; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
4801; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
4802; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
4803; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4804; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4805; AVX2-FCP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
4806; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm12
4807; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
4808; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
4809; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
4810; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
4811; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4812; AVX2-FCP-NEXT:    vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload
4813; AVX2-FCP-NEXT:    # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
4814; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
4815; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
4816; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4817; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
4818; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4819; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4820; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
4821; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
4822; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7]
4823; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
4824; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
4825; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
4826; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
4827; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
4828; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
4829; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
4830; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
4831; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4832; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
4833; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4834; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
4835; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
4836; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4837; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
4838; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4839; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
4840; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm3
4841; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
4842; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7]
4843; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
4844; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
4845; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
4846; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4847; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4848; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4849; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4850; AVX2-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
4851; AVX2-FCP-NEXT:    # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
4852; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4853; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4854; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
4855; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
4856; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
4857; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
4858; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
4859; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
4860; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
4861; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
4862; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4863; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4864; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
4865; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
4866; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
4867; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
4868; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
4869; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm9
4870; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
4871; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
4872; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4873; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
4874; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm12
4875; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
4876; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
4877; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
4878; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
4879; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4880; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4881; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
4882; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
4883; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
4884; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
4885; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
4886; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
4887; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
4888; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4889; AVX2-FCP-NEXT:    vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
4890; AVX2-FCP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4891; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4892; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4893; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
4894; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
4895; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
4896; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
4897; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4898; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
4899; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
4900; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4901; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm6
4902; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
4903; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
4904; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
4905; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
4906; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
4907; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
4908; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
4909; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
4910; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
4911; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm4
4912; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
4913; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
4914; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
4915; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm14, %ymm4
4916; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4917; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
4918; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
4919; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
4920; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4921; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
4922; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
4923; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
4924; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm2
4925; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm3
4926; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
4927; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
4928; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4929; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4930; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
4931; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
4932; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4933; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
4934; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
4935; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4936; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
4937; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
4938; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm7
4939; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
4940; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
4941; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
4942; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
4943; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4944; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
4945; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
4946; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
4947; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
4948; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm9
4949; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
4950; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
4951; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
4952; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4953; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
4954; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
4955; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
4956; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
4957; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
4958; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4959; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4960; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4961; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
4962; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
4963; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
4964; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
4965; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4966; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
4967; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4968; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
4969; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4970; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
4971; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4972; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rdx)
4973; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4974; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rdx)
4975; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4976; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
4977; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
4978; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%r8)
4979; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r8)
4980; AVX2-FCP-NEXT:    vmovdqa %ymm3, 32(%r9)
4981; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%r9)
4982; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4983; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rax)
4984; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
4985; AVX2-FCP-NEXT:    addq $456, %rsp # imm = 0x1C8
4986; AVX2-FCP-NEXT:    vzeroupper
4987; AVX2-FCP-NEXT:    retq
4988;
4989; AVX512-LABEL: load_i16_stride6_vf32:
4990; AVX512:       # %bb.0:
4991; AVX512-NEXT:    subq $72, %rsp
4992; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
4993; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm14
4994; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm11
4995; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7]
4996; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
4997; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm0
4998; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3]
4999; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
5000; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
5001; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm4
5002; AVX512-NEXT:    vmovdqa (%rdi), %ymm13
5003; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
5004; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm6
5005; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm7
5006; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
5007; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm24
5008; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm26
5009; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm4
5010; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
5011; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7]
5012; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
5013; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
5014; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5015; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm7, %zmm3
5016; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5017; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3]
5018; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
5019; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm16
5020; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm7
5021; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
5022; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3]
5023; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
5024; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7]
5025; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm8
5026; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7]
5027; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm29
5028; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm28
5029; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
5030; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
5031; AVX512-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
5032; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm8
5033; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm10
5034; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
5035; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm18
5036; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm20
5037; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
5038; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
5039; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7]
5040; AVX512-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
5041; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm10
5042; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm9
5043; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm9[2,3],mem[2,3]
5044; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm9, %ymm15
5045; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7]
5046; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm25
5047; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm27
5048; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5049; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
5050; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
5051; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
5052; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm30
5053; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
5054; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
5055; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
5056; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
5057; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
5058; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
5059; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5060; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7]
5061; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5062; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5063; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5064; AVX512-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
5065; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
5066; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
5067; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
5068; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7]
5069; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
5070; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5]
5071; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5072; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5073; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
5074; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5075; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5076; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5077; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm22
5078; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
5079; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm31
5080; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm21
5081; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
5082; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
5083; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1]
5084; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7]
5085; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
5086; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7]
5087; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5088; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
5089; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
5090; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5091; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
5092; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
5093; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
5094; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
5095; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1]
5096; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4]
5097; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
5098; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5099; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
5100; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm23
5101; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
5102; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
5103; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm19
5104; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
5105; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
5106; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1]
5107; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7]
5108; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
5109; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
5110; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5111; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm13
5112; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm12
5113; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7]
5114; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5115; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
5116; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5117; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm28
5118; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm29
5119; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm0
5120; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
5121; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
5122; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
5123; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
5124; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
5125; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
5126; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
5127; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
5128; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
5129; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm14
5130; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm0
5131; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm1
5132; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
5133; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5134; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
5135; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5136; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
5137; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16
5138; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5139; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm17 ^ (zmm0 & (zmm2 ^ zmm17))
5140; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5141; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm2))
5142; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7]
5143; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7]
5144; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
5145; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7]
5146; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
5147; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
5148; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
5149; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
5150; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5151; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm8, %zmm2
5152; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm30
5153; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm18
5154; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
5155; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
5156; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
5157; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
5158; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
5159; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
5160; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
5161; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
5162; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
5163; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
5164; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
5165; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
5166; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
5167; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
5168; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
5169; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
5170; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm0 & (zmm5 ^ zmm2))
5171; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
5172; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm5))
5173; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
5174; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm0
5175; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
5176; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
5177; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm2
5178; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
5179; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
5180; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
5181; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
5182; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm4
5183; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
5184; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
5185; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1]
5186; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
5187; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
5188; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
5189; AVX512-NEXT:    vpshufb %xmm9, %xmm4, %xmm5
5190; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7]
5191; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
5192; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
5193; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
5194; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
5195; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
5196; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
5197; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
5198; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
5199; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
5200; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7]
5201; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5202; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
5203; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm11) | ymm10
5204; AVX512-NEXT:    movw $31, %ax
5205; AVX512-NEXT:    kmovw %eax, %k1
5206; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm2 {%k1}
5207; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm8
5208; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm10
5209; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7]
5210; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm8
5211; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm12
5212; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1]
5213; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7]
5214; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
5215; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7]
5216; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
5217; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm12
5218; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm13
5219; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
5220; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
5221; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7]
5222; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
5223; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
5224; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
5225; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
5226; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5227; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
5228; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
5229; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
5230; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
5231; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
5232; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
5233; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
5234; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
5235; AVX512-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
5236; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
5237; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
5238; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
5239; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5240; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4
5241; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
5242; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
5243; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7]
5244; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
5245; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7]
5246; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5247; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
5248; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
5249; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
5250; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5251; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
5252; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
5253; AVX512-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
5254; AVX512-NEXT:    movw $-2048, %ax # imm = 0xF800
5255; AVX512-NEXT:    kmovw %eax, %k1
5256; AVX512-NEXT:    vmovdqa32 %zmm30, %zmm4 {%k1}
5257; AVX512-NEXT:    vmovdqa64 %zmm4, (%rsi)
5258; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload
5259; AVX512-NEXT:    # zmm15 = mem ^ (zmm3 & (zmm15 ^ mem))
5260; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
5261; AVX512-NEXT:    vmovdqa64 %zmm15, (%rdx)
5262; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5263; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ zmm2))
5264; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0))
5265; AVX512-NEXT:    vmovdqa64 %zmm16, (%rcx)
5266; AVX512-NEXT:    vmovdqa64 %zmm20, (%r8)
5267; AVX512-NEXT:    vmovdqa64 %zmm10, (%r9)
5268; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
5269; AVX512-NEXT:    addq $72, %rsp
5270; AVX512-NEXT:    vzeroupper
5271; AVX512-NEXT:    retq
5272;
5273; AVX512-FCP-LABEL: load_i16_stride6_vf32:
5274; AVX512-FCP:       # %bb.0:
5275; AVX512-FCP-NEXT:    subq $136, %rsp
5276; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
5277; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
5278; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm15
5279; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7]
5280; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm2
5281; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm3
5282; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3]
5283; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm3
5284; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
5285; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
5286; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
5287; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm10
5288; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
5289; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
5290; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7]
5291; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
5292; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
5293; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm5
5294; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
5295; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm8
5296; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
5297; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
5298; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
5299; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5300; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7]
5301; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
5302; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
5303; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm8
5304; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
5305; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3]
5306; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
5307; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7]
5308; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3]
5309; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm8
5310; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7]
5311; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm27
5312; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
5313; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
5314; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7]
5315; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5316; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
5317; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm8
5318; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
5319; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm19
5320; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
5321; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm10
5322; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm8
5323; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm9
5324; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7]
5325; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm10
5326; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
5327; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3]
5328; AVX512-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm9, %ymm14
5329; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7]
5330; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm24
5331; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
5332; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5333; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
5334; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
5335; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
5336; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
5337; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
5338; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
5339; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm10
5340; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
5341; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
5342; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
5343; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5344; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
5345; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5346; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm2, %zmm2
5347; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5348; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
5349; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
5350; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
5351; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
5352; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5353; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
5354; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
5355; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5356; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5357; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5358; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
5359; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5360; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5361; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5362; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
5363; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
5364; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
5365; AVX512-FCP-NEXT:    vmovdqa64 %ymm15, %ymm31
5366; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5367; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
5368; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7]
5369; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1]
5370; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
5371; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
5372; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
5373; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
5374; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
5375; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
5376; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
5377; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
5378; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
5379; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
5380; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
5381; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
5382; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5383; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm17
5384; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm23
5385; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
5386; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
5387; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5388; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
5389; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
5390; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
5391; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
5392; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
5393; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
5394; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm14
5395; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
5396; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7]
5397; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5398; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
5399; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5400; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm27
5401; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm15
5402; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm1
5403; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7]
5404; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm1, %xmm16
5405; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
5406; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
5407; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1]
5408; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4]
5409; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
5410; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm13
5411; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
5412; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
5413; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
5414; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5415; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
5416; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5417; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
5418; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16
5419; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5420; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm17 ^ (zmm0 & (zmm11 ^ zmm17))
5421; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5422; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm11))
5423; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7]
5424; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
5425; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7]
5426; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
5427; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
5428; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
5429; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7]
5430; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5431; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm7, %zmm7
5432; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7]
5433; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
5434; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
5435; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm19
5436; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm28
5437; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
5438; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
5439; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5440; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
5441; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
5442; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
5443; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5444; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
5445; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
5446; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
5447; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5448; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7))
5449; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
5450; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4))
5451; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
5452; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
5453; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
5454; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
5455; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm0
5456; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
5457; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
5458; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
5459; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
5460; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
5461; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
5462; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm6
5463; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
5464; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
5465; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm2
5466; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm7
5467; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
5468; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5469; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm2
5470; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
5471; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
5472; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm9
5473; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
5474; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
5475; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
5476; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7]
5477; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7]
5478; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5479; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
5480; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm11) | ymm10
5481; AVX512-FCP-NEXT:    movw $31, %ax
5482; AVX512-FCP-NEXT:    kmovw %eax, %k1
5483; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
5484; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm5
5485; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7]
5486; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm5
5487; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm13
5488; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
5489; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
5490; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7]
5491; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5492; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm12
5493; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm13
5494; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
5495; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
5496; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
5497; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
5498; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
5499; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm9
5500; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
5501; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7]
5502; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5503; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm11)
5504; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
5505; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
5506; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
5507; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
5508; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
5509; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
5510; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
5511; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
5512; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm1
5513; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
5514; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm0
5515; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
5516; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
5517; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5518; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
5519; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
5520; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5521; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5522; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5523; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
5524; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
5525; AVX512-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
5526; AVX512-FCP-NEXT:    kmovw %eax, %k1
5527; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm4 {%k1}
5528; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
5529; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
5530; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
5531; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
5532; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm4 {%k1}
5533; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rdx)
5534; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5535; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm2))
5536; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm1))
5537; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
5538; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%r8)
5539; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
5540; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
5541; AVX512-FCP-NEXT:    addq $136, %rsp
5542; AVX512-FCP-NEXT:    vzeroupper
5543; AVX512-FCP-NEXT:    retq
5544;
5545; AVX512DQ-LABEL: load_i16_stride6_vf32:
5546; AVX512DQ:       # %bb.0:
5547; AVX512DQ-NEXT:    pushq %rax
5548; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
5549; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm13
5550; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm2
5551; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
5552; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm25
5553; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
5554; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
5555; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
5556; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
5557; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
5558; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm2
5559; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm5
5560; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm6
5561; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
5562; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm7
5563; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
5564; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm20
5565; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm22
5566; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
5567; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
5568; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7]
5569; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
5570; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
5571; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5572; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm16
5573; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3]
5574; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
5575; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm18
5576; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm19
5577; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
5578; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm6
5579; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3]
5580; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
5581; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7]
5582; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm12
5583; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
5584; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm28
5585; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
5586; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7]
5587; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
5588; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm5
5589; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
5590; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm21
5591; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm23
5592; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm5
5593; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7]
5594; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
5595; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
5596; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm8
5597; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm7
5598; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3]
5599; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm7, %ymm11
5600; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
5601; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm24
5602; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
5603; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5604; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
5605; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
5606; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
5607; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5608; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16))
5609; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
5610; AVX512DQ-NEXT:    kmovw %eax, %k1
5611; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
5612; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5613; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
5614; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm8
5615; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
5616; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7]
5617; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
5618; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm15, %xmm8
5619; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5620; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3],xmm8[4,5],xmm14[6],xmm8[7]
5621; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5622; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm8, %zmm3
5623; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
5624; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
5625; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7]
5626; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
5627; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5628; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm5, %xmm0
5629; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5630; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5631; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm17 & (zmm2 ^ zmm3))
5632; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5633; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
5634; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5635; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5636; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5637; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
5638; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5639; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
5640; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
5641; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm29
5642; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm30
5643; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
5644; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
5645; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
5646; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,3,4,5,6,7]
5647; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
5648; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
5649; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5650; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
5651; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
5652; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5653; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
5654; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3]
5655; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7]
5656; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
5657; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
5658; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
5659; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
5660; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5661; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm16
5662; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm25
5663; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm13
5664; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
5665; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
5666; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
5667; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
5668; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
5669; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
5670; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
5671; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
5672; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5673; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm10
5674; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
5675; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5676; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
5677; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5678; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
5679; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm1
5680; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
5681; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
5682; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
5683; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
5684; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
5685; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
5686; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4]
5687; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
5688; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
5689; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
5690; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm2
5691; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7]
5692; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5693; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
5694; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5695; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
5696; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm18
5697; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5698; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16))
5699; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5700; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9))
5701; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7]
5702; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
5703; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
5704; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
5705; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
5706; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7]
5707; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
5708; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7]
5709; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5710; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm7
5711; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
5712; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
5713; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
5714; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
5715; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
5716; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
5717; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
5718; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
5719; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
5720; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
5721; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7]
5722; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5723; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
5724; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
5725; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
5726; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5727; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7))
5728; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
5729; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm4))
5730; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
5731; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm0
5732; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm1
5733; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
5734; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm2
5735; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
5736; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
5737; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
5738; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm2
5739; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm4
5740; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
5741; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm2
5742; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1]
5743; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
5744; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
5745; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
5746; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm5
5747; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
5748; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
5749; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm4
5750; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
5751; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm5
5752; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7]
5753; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
5754; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm13
5755; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7]
5756; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
5757; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5758; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
5759; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm8
5760; AVX512DQ-NEXT:    movw $31, %ax
5761; AVX512DQ-NEXT:    kmovw %eax, %k1
5762; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1}
5763; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm6
5764; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm8
5765; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
5766; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm6
5767; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm11
5768; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1]
5769; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7]
5770; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
5771; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7]
5772; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5773; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm11
5774; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm10
5775; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7]
5776; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
5777; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
5778; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
5779; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
5780; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
5781; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
5782; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5783; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
5784; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
5785; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
5786; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
5787; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
5788; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
5789; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5790; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
5791; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm5, %xmm2
5792; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3]
5793; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
5794; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7]
5795; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5796; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm14) | ymm3
5797; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1}
5798; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
5799; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
5800; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
5801; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
5802; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5803; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
5804; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
5805; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
5806; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5807; AVX512DQ-NEXT:    vmovaps %zmm2, (%rsi)
5808; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5809; AVX512DQ-NEXT:    vmovaps %zmm2, (%rdx)
5810; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5811; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm4))
5812; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0))
5813; AVX512DQ-NEXT:    vmovdqa64 %zmm18, (%rcx)
5814; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%r8)
5815; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%r9)
5816; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
5817; AVX512DQ-NEXT:    popq %rax
5818; AVX512DQ-NEXT:    vzeroupper
5819; AVX512DQ-NEXT:    retq
5820;
5821; AVX512DQ-FCP-LABEL: load_i16_stride6_vf32:
5822; AVX512DQ-FCP:       # %bb.0:
5823; AVX512DQ-FCP-NEXT:    pushq %rax
5824; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
5825; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
5826; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
5827; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
5828; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
5829; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
5830; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm2
5831; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
5832; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
5833; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
5834; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
5835; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
5836; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm10
5837; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
5838; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
5839; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
5840; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm22
5841; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
5842; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
5843; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm4
5844; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm7
5845; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm8
5846; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7]
5847; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
5848; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm16
5849; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7]
5850; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm18
5851; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
5852; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
5853; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm1
5854; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
5855; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
5856; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7]
5857; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
5858; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm12
5859; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
5860; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
5861; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
5862; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7]
5863; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
5864; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
5865; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
5866; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
5867; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
5868; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm11
5869; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm1
5870; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
5871; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7]
5872; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm8
5873; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
5874; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3]
5875; AVX512DQ-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm6, %ymm11
5876; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
5877; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm23
5878; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm26
5879; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5880; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
5881; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
5882; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
5883; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5884; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16))
5885; AVX512DQ-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
5886; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
5887; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
5888; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5889; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
5890; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
5891; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm8
5892; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7]
5893; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
5894; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
5895; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5]
5896; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7]
5897; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5898; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm7, %zmm5
5899; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm7
5900; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
5901; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7]
5902; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
5903; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5904; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm0
5905; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
5906; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5907; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm17 & (zmm2 ^ zmm5))
5908; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5909; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
5910; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5911; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5912; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5913; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
5914; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5915; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
5916; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
5917; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm29
5918; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm30
5919; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
5920; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
5921; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
5922; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
5923; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
5924; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5925; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm0
5926; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
5927; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
5928; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
5929; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
5930; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
5931; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm3
5932; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
5933; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
5934; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
5935; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5936; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm16
5937; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm24
5938; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm13
5939; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
5940; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
5941; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
5942; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
5943; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
5944; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
5945; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
5946; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
5947; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm10
5948; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
5949; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5950; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
5951; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7]
5952; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
5953; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm2
5954; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5955; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm1, %xmm17
5956; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3]
5957; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
5958; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1]
5959; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4]
5960; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
5961; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
5962; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
5963; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
5964; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
5965; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5966; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
5967; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5968; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
5969; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm18
5970; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5971; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16))
5972; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5973; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9))
5974; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7]
5975; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
5976; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
5977; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
5978; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm11
5979; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
5980; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7]
5981; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5982; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm7
5983; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7]
5984; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
5985; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
5986; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
5987; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
5988; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5989; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
5990; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
5991; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7]
5992; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5993; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
5994; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
5995; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
5996; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5997; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm0 & (zmm3 ^ zmm7))
5998; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
5999; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3))
6000; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
6001; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
6002; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
6003; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
6004; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm0
6005; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm1
6006; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7]
6007; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
6008; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
6009; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm4
6010; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
6011; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
6012; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1]
6013; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
6014; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
6015; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
6016; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7]
6017; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6018; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
6019; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
6020; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm8
6021; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
6022; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
6023; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
6024; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
6025; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
6026; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6027; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6028; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm13) | ymm11
6029; AVX512DQ-FCP-NEXT:    movw $31, %ax
6030; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
6031; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1}
6032; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm5
6033; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm11
6034; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7]
6035; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm5
6036; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm14
6037; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
6038; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
6039; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7]
6040; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6041; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm14
6042; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm10
6043; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7]
6044; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
6045; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7]
6046; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
6047; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
6048; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm9
6049; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
6050; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7]
6051; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6052; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm8 & ymm13)
6053; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
6054; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
6055; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7]
6056; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
6057; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
6058; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
6059; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
6060; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
6061; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm1
6062; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1}
6063; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
6064; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
6065; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
6066; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6067; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
6068; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
6069; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
6070; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6071; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rsi)
6072; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6073; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rdx)
6074; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6075; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm17 & (zmm7 ^ zmm0))
6076; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm17 & (zmm2 ^ zmm1))
6077; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rcx)
6078; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
6079; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
6080; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
6081; AVX512DQ-FCP-NEXT:    popq %rax
6082; AVX512DQ-FCP-NEXT:    vzeroupper
6083; AVX512DQ-FCP-NEXT:    retq
6084;
6085; AVX512BW-LABEL: load_i16_stride6_vf32:
6086; AVX512BW:       # %bb.0:
6087; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6088; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
6089; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
6090; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
6091; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
6092; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
6093; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
6094; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6095; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
6096; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm7
6097; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6098; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6099; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm4, %zmm8
6100; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6101; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm6
6102; AVX512BW-NEXT:    movl $4192256, %edi # imm = 0x3FF800
6103; AVX512BW-NEXT:    kmovd %edi, %k1
6104; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm6 {%k1}
6105; AVX512BW-NEXT:    movw $-2048, %di # imm = 0xF800
6106; AVX512BW-NEXT:    kmovd %edi, %k2
6107; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm6 {%k2}
6108; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6109; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6110; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6111; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6112; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6113; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm4, %zmm9
6114; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6115; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm7
6116; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm7 {%k1}
6117; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k2}
6118; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6119; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6120; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6121; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6122; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6123; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6124; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6125; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm5, %zmm10
6126; AVX512BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
6127; AVX512BW-NEXT:    kmovd %edi, %k2
6128; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
6129; AVX512BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
6130; AVX512BW-NEXT:    kmovd %edi, %k1
6131; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k1}
6132; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6133; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6134; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6135; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6136; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6137; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6138; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6139; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm5, %zmm11
6140; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm11 {%k2}
6141; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm11 {%k1}
6142; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6143; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6144; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6145; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6146; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6147; AVX512BW-NEXT:    movw $31, %di
6148; AVX512BW-NEXT:    kmovd %edi, %k2
6149; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
6150; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6151; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6152; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
6153; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
6154; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6155; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6156; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6157; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
6158; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm3
6159; AVX512BW-NEXT:    vmovdqa32 %zmm8, %zmm3 {%k2}
6160; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6161; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
6162; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
6163; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
6164; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rsi)
6165; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rdx)
6166; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
6167; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%r8)
6168; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%r9)
6169; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
6170; AVX512BW-NEXT:    vzeroupper
6171; AVX512BW-NEXT:    retq
6172;
6173; AVX512BW-FCP-LABEL: load_i16_stride6_vf32:
6174; AVX512BW-FCP:       # %bb.0:
6175; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6176; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
6177; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
6178; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
6179; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
6180; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
6181; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
6182; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6183; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
6184; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm7
6185; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6186; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6187; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm4, %zmm8
6188; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6189; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm6
6190; AVX512BW-FCP-NEXT:    movl $4192256, %edi # imm = 0x3FF800
6191; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
6192; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm6 {%k1}
6193; AVX512BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
6194; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
6195; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm6 {%k2}
6196; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6197; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6198; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6199; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6200; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6201; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm4, %zmm9
6202; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6203; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm7
6204; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm7 {%k1}
6205; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k2}
6206; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6207; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6208; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6209; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6210; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6211; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6212; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6213; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm5, %zmm10
6214; AVX512BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
6215; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
6216; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
6217; AVX512BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
6218; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
6219; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k1}
6220; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6221; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6222; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6223; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6224; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6225; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6226; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6227; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm5, %zmm11
6228; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm11 {%k2}
6229; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm11 {%k1}
6230; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6231; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6232; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6233; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6234; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6235; AVX512BW-FCP-NEXT:    movw $31, %di
6236; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
6237; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
6238; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6239; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6240; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
6241; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
6242; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6243; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6244; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6245; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
6246; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm3
6247; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm3 {%k2}
6248; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6249; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
6250; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
6251; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
6252; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
6253; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
6254; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
6255; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r8)
6256; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%r9)
6257; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
6258; AVX512BW-FCP-NEXT:    vzeroupper
6259; AVX512BW-FCP-NEXT:    retq
6260;
6261; AVX512DQ-BW-LABEL: load_i16_stride6_vf32:
6262; AVX512DQ-BW:       # %bb.0:
6263; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6264; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm3
6265; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
6266; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
6267; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
6268; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
6269; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
6270; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6271; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
6272; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm7
6273; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6274; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6275; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm4, %zmm8
6276; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6277; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm6
6278; AVX512DQ-BW-NEXT:    movl $4192256, %edi # imm = 0x3FF800
6279; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
6280; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm6 {%k1}
6281; AVX512DQ-BW-NEXT:    movw $-2048, %di # imm = 0xF800
6282; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
6283; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm6 {%k2}
6284; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6285; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6286; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6287; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6288; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6289; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm4, %zmm9
6290; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6291; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm7
6292; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm7 {%k1}
6293; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k2}
6294; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6295; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6296; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6297; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6298; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6299; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6300; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6301; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm5, %zmm10
6302; AVX512DQ-BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
6303; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
6304; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
6305; AVX512DQ-BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
6306; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
6307; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k1}
6308; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6309; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6310; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6311; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6312; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6313; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6314; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6315; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm5, %zmm11
6316; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm11 {%k2}
6317; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm11 {%k1}
6318; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6319; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6320; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6321; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6322; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6323; AVX512DQ-BW-NEXT:    movw $31, %di
6324; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
6325; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
6326; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6327; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6328; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
6329; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
6330; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6331; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6332; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6333; AVX512DQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
6334; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm2, %zmm3
6335; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm8, %zmm3 {%k2}
6336; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6337; AVX512DQ-BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
6338; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
6339; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
6340; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, (%rsi)
6341; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rdx)
6342; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
6343; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, (%r8)
6344; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%r9)
6345; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
6346; AVX512DQ-BW-NEXT:    vzeroupper
6347; AVX512DQ-BW-NEXT:    retq
6348;
6349; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf32:
6350; AVX512DQ-BW-FCP:       # %bb.0:
6351; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6352; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
6353; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
6354; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
6355; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
6356; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
6357; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm1
6358; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6359; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
6360; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm7
6361; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6362; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6363; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm4, %zmm8
6364; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6365; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm6
6366; AVX512DQ-BW-FCP-NEXT:    movl $4192256, %edi # imm = 0x3FF800
6367; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
6368; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm6 {%k1}
6369; AVX512DQ-BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
6370; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
6371; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm6 {%k2}
6372; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6373; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6374; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6375; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6376; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6377; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm4, %zmm9
6378; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6379; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm7
6380; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm7 {%k1}
6381; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k2}
6382; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6383; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6384; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6385; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6386; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6387; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6388; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6389; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm5, %zmm10
6390; AVX512DQ-BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
6391; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
6392; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
6393; AVX512DQ-BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
6394; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
6395; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k1}
6396; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6397; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6398; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm8
6399; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6400; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6401; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6402; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6403; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm5, %zmm11
6404; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm11 {%k2}
6405; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm11 {%k1}
6406; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6407; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6408; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6409; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
6410; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm9
6411; AVX512DQ-BW-FCP-NEXT:    movw $31, %di
6412; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
6413; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k2}
6414; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6415; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
6416; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
6417; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k1}
6418; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6419; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm3, %zmm8
6420; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6421; AVX512DQ-BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
6422; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm2, %zmm3
6423; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm3 {%k2}
6424; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6425; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
6426; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
6427; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
6428; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
6429; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
6430; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
6431; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r8)
6432; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%r9)
6433; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
6434; AVX512DQ-BW-FCP-NEXT:    vzeroupper
6435; AVX512DQ-BW-FCP-NEXT:    retq
6436  %wide.vec = load <192 x i16>, ptr %in.vec, align 64
6437  %strided.vec0 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
6438  %strided.vec1 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
6439  %strided.vec2 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
6440  %strided.vec3 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
6441  %strided.vec4 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
6442  %strided.vec5 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
6443  store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
6444  store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
6445  store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
6446  store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
6447  store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
6448  store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
6449  ret void
6450}
6451
6452define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
6453; SSE-LABEL: load_i16_stride6_vf64:
6454; SSE:       # %bb.0:
6455; SSE-NEXT:    subq $1176, %rsp # imm = 0x498
6456; SSE-NEXT:    movdqa 496(%rdi), %xmm5
6457; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6458; SSE-NEXT:    movdqa 512(%rdi), %xmm8
6459; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6460; SSE-NEXT:    movdqa 144(%rdi), %xmm7
6461; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6462; SSE-NEXT:    movdqa 160(%rdi), %xmm3
6463; SSE-NEXT:    movdqa 176(%rdi), %xmm0
6464; SSE-NEXT:    movdqa 112(%rdi), %xmm6
6465; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6466; SSE-NEXT:    movdqa 96(%rdi), %xmm4
6467; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6468; SSE-NEXT:    movdqa 128(%rdi), %xmm1
6469; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6470; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7]
6471; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
6472; SSE-NEXT:    movdqa %xmm9, %xmm2
6473; SSE-NEXT:    pandn %xmm1, %xmm2
6474; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3]
6475; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6476; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
6477; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
6478; SSE-NEXT:    pand %xmm9, %xmm1
6479; SSE-NEXT:    por %xmm2, %xmm1
6480; SSE-NEXT:    movdqa %xmm1, %xmm2
6481; SSE-NEXT:    movdqa %xmm3, %xmm1
6482; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3]
6483; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
6484; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6485; SSE-NEXT:    movdqa %xmm0, %xmm4
6486; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
6487; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6488; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6489; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6490; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6491; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6492; SSE-NEXT:    pslld $16, %xmm0
6493; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6494; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6495; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3]
6496; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6497; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6498; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
6499; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6500; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6501; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7]
6502; SSE-NEXT:    movdqa %xmm9, %xmm1
6503; SSE-NEXT:    pandn %xmm0, %xmm1
6504; SSE-NEXT:    movdqa 480(%rdi), %xmm0
6505; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6506; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6507; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6508; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6509; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
6510; SSE-NEXT:    pand %xmm9, %xmm0
6511; SSE-NEXT:    por %xmm1, %xmm0
6512; SSE-NEXT:    movdqa %xmm0, %xmm2
6513; SSE-NEXT:    movdqa 544(%rdi), %xmm3
6514; SSE-NEXT:    movdqa 560(%rdi), %xmm1
6515; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
6516; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6517; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6518; SSE-NEXT:    movdqa %xmm1, %xmm0
6519; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
6520; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6521; SSE-NEXT:    movdqa %xmm3, %xmm0
6522; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6523; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
6524; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
6525; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6526; SSE-NEXT:    pslld $16, %xmm1
6527; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6528; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6529; SSE-NEXT:    movdqa 528(%rdi), %xmm1
6530; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6531; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
6532; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6533; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
6534; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
6535; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
6536; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6537; SSE-NEXT:    movdqa 32(%rdi), %xmm0
6538; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6539; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6540; SSE-NEXT:    movdqa %xmm9, %xmm1
6541; SSE-NEXT:    pandn %xmm0, %xmm1
6542; SSE-NEXT:    movdqa (%rdi), %xmm0
6543; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6544; SSE-NEXT:    movdqa 16(%rdi), %xmm6
6545; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6546; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6547; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6548; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
6549; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6550; SSE-NEXT:    pand %xmm9, %xmm0
6551; SSE-NEXT:    por %xmm1, %xmm0
6552; SSE-NEXT:    movdqa %xmm0, %xmm2
6553; SSE-NEXT:    movdqa 64(%rdi), %xmm3
6554; SSE-NEXT:    movdqa 80(%rdi), %xmm0
6555; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
6556; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6557; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6558; SSE-NEXT:    movdqa %xmm0, %xmm1
6559; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
6560; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6561; SSE-NEXT:    movdqa %xmm3, %xmm1
6562; SSE-NEXT:    movaps %xmm3, (%rsp) # 16-byte Spill
6563; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6564; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6565; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6566; SSE-NEXT:    pslld $16, %xmm0
6567; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6568; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6569; SSE-NEXT:    movdqa 48(%rdi), %xmm0
6570; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6571; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6572; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6573; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6574; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
6575; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6576; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6577; SSE-NEXT:    movdqa 416(%rdi), %xmm0
6578; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6579; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6580; SSE-NEXT:    movdqa %xmm9, %xmm1
6581; SSE-NEXT:    pandn %xmm0, %xmm1
6582; SSE-NEXT:    movdqa 400(%rdi), %xmm2
6583; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6584; SSE-NEXT:    movdqa 384(%rdi), %xmm0
6585; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6586; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6587; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6588; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6589; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6590; SSE-NEXT:    pand %xmm9, %xmm0
6591; SSE-NEXT:    por %xmm1, %xmm0
6592; SSE-NEXT:    movdqa %xmm0, %xmm2
6593; SSE-NEXT:    movdqa 448(%rdi), %xmm3
6594; SSE-NEXT:    movdqa 464(%rdi), %xmm0
6595; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3]
6596; SSE-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
6597; SSE-NEXT:    movdqa %xmm0, %xmm1
6598; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
6599; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6600; SSE-NEXT:    movdqa %xmm3, %xmm1
6601; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6602; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6603; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6604; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6605; SSE-NEXT:    pslld $16, %xmm0
6606; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6607; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6608; SSE-NEXT:    movdqa 432(%rdi), %xmm0
6609; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6610; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6611; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6612; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6613; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
6614; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6615; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6616; SSE-NEXT:    movdqa 320(%rdi), %xmm0
6617; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6618; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6619; SSE-NEXT:    movdqa %xmm9, %xmm1
6620; SSE-NEXT:    pandn %xmm0, %xmm1
6621; SSE-NEXT:    movdqa 304(%rdi), %xmm2
6622; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6623; SSE-NEXT:    movdqa 288(%rdi), %xmm0
6624; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6625; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6626; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6627; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6628; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6629; SSE-NEXT:    pand %xmm9, %xmm0
6630; SSE-NEXT:    por %xmm1, %xmm0
6631; SSE-NEXT:    movdqa %xmm0, %xmm2
6632; SSE-NEXT:    movdqa 352(%rdi), %xmm3
6633; SSE-NEXT:    movdqa 368(%rdi), %xmm0
6634; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3]
6635; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
6636; SSE-NEXT:    movdqa %xmm0, %xmm1
6637; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
6638; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6639; SSE-NEXT:    movdqa %xmm3, %xmm1
6640; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6641; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6642; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6643; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6644; SSE-NEXT:    pslld $16, %xmm0
6645; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6646; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6647; SSE-NEXT:    movdqa 336(%rdi), %xmm0
6648; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6649; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6650; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6651; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6652; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
6653; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6654; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6655; SSE-NEXT:    movdqa 704(%rdi), %xmm0
6656; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6657; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6658; SSE-NEXT:    movdqa %xmm9, %xmm1
6659; SSE-NEXT:    pandn %xmm0, %xmm1
6660; SSE-NEXT:    movdqa 688(%rdi), %xmm2
6661; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6662; SSE-NEXT:    movdqa 672(%rdi), %xmm0
6663; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6664; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6665; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6666; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6667; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6668; SSE-NEXT:    pand %xmm9, %xmm0
6669; SSE-NEXT:    por %xmm1, %xmm0
6670; SSE-NEXT:    movdqa %xmm0, %xmm2
6671; SSE-NEXT:    movdqa 736(%rdi), %xmm3
6672; SSE-NEXT:    movdqa 752(%rdi), %xmm1
6673; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3]
6674; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
6675; SSE-NEXT:    movdqa %xmm1, %xmm0
6676; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
6677; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6678; SSE-NEXT:    movdqa %xmm3, %xmm0
6679; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6680; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
6681; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
6682; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6683; SSE-NEXT:    pslld $16, %xmm1
6684; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6685; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6686; SSE-NEXT:    movdqa 720(%rdi), %xmm1
6687; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6688; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
6689; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6690; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
6691; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
6692; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
6693; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6694; SSE-NEXT:    movdqa 224(%rdi), %xmm0
6695; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6696; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6697; SSE-NEXT:    movdqa %xmm9, %xmm1
6698; SSE-NEXT:    pandn %xmm0, %xmm1
6699; SSE-NEXT:    movdqa 208(%rdi), %xmm2
6700; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6701; SSE-NEXT:    movdqa 192(%rdi), %xmm0
6702; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6703; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6704; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6705; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6706; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6707; SSE-NEXT:    pand %xmm9, %xmm0
6708; SSE-NEXT:    por %xmm1, %xmm0
6709; SSE-NEXT:    movdqa %xmm0, %xmm1
6710; SSE-NEXT:    movdqa 256(%rdi), %xmm4
6711; SSE-NEXT:    movdqa 272(%rdi), %xmm0
6712; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
6713; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
6714; SSE-NEXT:    movdqa %xmm0, %xmm2
6715; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0]
6716; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6717; SSE-NEXT:    movdqa %xmm4, %xmm2
6718; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6719; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
6720; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
6721; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6722; SSE-NEXT:    pslld $16, %xmm0
6723; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6724; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6725; SSE-NEXT:    movdqa 240(%rdi), %xmm0
6726; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6727; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6728; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6729; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6730; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3]
6731; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
6732; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6733; SSE-NEXT:    movdqa 608(%rdi), %xmm0
6734; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6735; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6736; SSE-NEXT:    movdqa %xmm9, %xmm2
6737; SSE-NEXT:    pandn %xmm0, %xmm2
6738; SSE-NEXT:    movdqa 592(%rdi), %xmm13
6739; SSE-NEXT:    movdqa 576(%rdi), %xmm0
6740; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6741; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3]
6742; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7]
6743; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
6744; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6745; SSE-NEXT:    pand %xmm9, %xmm0
6746; SSE-NEXT:    por %xmm2, %xmm0
6747; SSE-NEXT:    movdqa %xmm0, %xmm1
6748; SSE-NEXT:    movdqa 640(%rdi), %xmm5
6749; SSE-NEXT:    movdqa 656(%rdi), %xmm2
6750; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
6751; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6752; SSE-NEXT:    movdqa %xmm2, %xmm0
6753; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0]
6754; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6755; SSE-NEXT:    movdqa %xmm5, %xmm0
6756; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6757; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0]
6758; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3]
6759; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6760; SSE-NEXT:    pslld $16, %xmm2
6761; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6762; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6763; SSE-NEXT:    movdqa 624(%rdi), %xmm2
6764; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6765; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3]
6766; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7]
6767; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3]
6768; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
6769; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6770; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6771; SSE-NEXT:    movdqa %xmm11, %xmm0
6772; SSE-NEXT:    psrld $16, %xmm0
6773; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6774; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6775; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6776; SSE-NEXT:    movdqa %xmm9, %xmm0
6777; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6778; SSE-NEXT:    pandn %xmm2, %xmm0
6779; SSE-NEXT:    pand %xmm9, %xmm1
6780; SSE-NEXT:    por %xmm0, %xmm1
6781; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6782; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6783; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6784; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3]
6785; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0]
6786; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6787; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6788; SSE-NEXT:    psrld $16, %xmm0
6789; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6790; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6791; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6792; SSE-NEXT:    movdqa %xmm9, %xmm0
6793; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6794; SSE-NEXT:    pand %xmm9, %xmm1
6795; SSE-NEXT:    por %xmm0, %xmm1
6796; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6797; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6798; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6799; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3]
6800; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
6801; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6802; SSE-NEXT:    movdqa %xmm6, %xmm0
6803; SSE-NEXT:    psrld $16, %xmm0
6804; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6805; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6806; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6807; SSE-NEXT:    movdqa %xmm9, %xmm0
6808; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6809; SSE-NEXT:    pandn %xmm6, %xmm0
6810; SSE-NEXT:    pand %xmm9, %xmm1
6811; SSE-NEXT:    por %xmm0, %xmm1
6812; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6813; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6814; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6815; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3]
6816; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
6817; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6818; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6819; SSE-NEXT:    psrld $16, %xmm0
6820; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6821; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6822; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6823; SSE-NEXT:    movdqa %xmm9, %xmm0
6824; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6825; SSE-NEXT:    pand %xmm9, %xmm1
6826; SSE-NEXT:    por %xmm0, %xmm1
6827; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6828; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6829; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3]
6830; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0]
6831; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6832; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6833; SSE-NEXT:    movdqa %xmm12, %xmm0
6834; SSE-NEXT:    psrld $16, %xmm0
6835; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6836; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6837; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6838; SSE-NEXT:    movdqa %xmm9, %xmm0
6839; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6840; SSE-NEXT:    pandn %xmm5, %xmm0
6841; SSE-NEXT:    pand %xmm9, %xmm1
6842; SSE-NEXT:    por %xmm0, %xmm1
6843; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6844; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6845; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3]
6846; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0]
6847; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6848; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6849; SSE-NEXT:    psrld $16, %xmm0
6850; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6851; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6852; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6853; SSE-NEXT:    movdqa %xmm9, %xmm0
6854; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6855; SSE-NEXT:    pand %xmm9, %xmm1
6856; SSE-NEXT:    por %xmm0, %xmm1
6857; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6858; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6859; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3]
6860; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0]
6861; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6862; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6863; SSE-NEXT:    movdqa %xmm15, %xmm0
6864; SSE-NEXT:    psrld $16, %xmm0
6865; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6866; SSE-NEXT:    # xmm1 = mem[0,1,2,3,5,7,6,7]
6867; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6868; SSE-NEXT:    movdqa %xmm9, %xmm0
6869; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6870; SSE-NEXT:    pandn %xmm10, %xmm0
6871; SSE-NEXT:    pand %xmm9, %xmm1
6872; SSE-NEXT:    por %xmm0, %xmm1
6873; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6874; SSE-NEXT:    # xmm0 = mem[0,1,1,3,4,5,6,7]
6875; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3]
6876; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
6877; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6878; SSE-NEXT:    psrld $16, %xmm13
6879; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7]
6880; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
6881; SSE-NEXT:    pand %xmm9, %xmm3
6882; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
6883; SSE-NEXT:    por %xmm3, %xmm9
6884; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
6885; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3]
6886; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0]
6887; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6888; SSE-NEXT:    movdqa %xmm2, %xmm0
6889; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6890; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6891; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
6892; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6893; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535]
6894; SSE-NEXT:    movdqa %xmm4, %xmm2
6895; SSE-NEXT:    pandn %xmm0, %xmm2
6896; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6897; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0]
6898; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3]
6899; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
6900; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6901; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7]
6902; SSE-NEXT:    pand %xmm4, %xmm3
6903; SSE-NEXT:    por %xmm2, %xmm3
6904; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6905; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6906; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
6907; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6908; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6909; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
6910; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4]
6911; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
6912; SSE-NEXT:    movdqa %xmm14, %xmm0
6913; SSE-NEXT:    pandn %xmm2, %xmm0
6914; SSE-NEXT:    pand %xmm14, %xmm3
6915; SSE-NEXT:    por %xmm3, %xmm0
6916; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6917; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6918; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6919; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
6920; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
6921; SSE-NEXT:    movdqa %xmm4, %xmm3
6922; SSE-NEXT:    pandn %xmm6, %xmm3
6923; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6924; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6925; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0]
6926; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3]
6927; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
6928; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6929; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
6930; SSE-NEXT:    pand %xmm4, %xmm2
6931; SSE-NEXT:    por %xmm3, %xmm2
6932; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
6933; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6934; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
6935; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
6936; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
6937; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
6938; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
6939; SSE-NEXT:    movdqa %xmm14, %xmm0
6940; SSE-NEXT:    pandn %xmm3, %xmm0
6941; SSE-NEXT:    pand %xmm14, %xmm2
6942; SSE-NEXT:    por %xmm2, %xmm0
6943; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6944; SSE-NEXT:    movdqa %xmm5, %xmm2
6945; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6946; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6947; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
6948; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
6949; SSE-NEXT:    movdqa %xmm4, %xmm5
6950; SSE-NEXT:    pandn %xmm2, %xmm5
6951; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6952; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0]
6953; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3]
6954; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
6955; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6956; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
6957; SSE-NEXT:    pand %xmm4, %xmm2
6958; SSE-NEXT:    por %xmm5, %xmm2
6959; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6960; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6961; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
6962; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6963; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
6964; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
6965; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
6966; SSE-NEXT:    movdqa %xmm14, %xmm0
6967; SSE-NEXT:    pandn %xmm5, %xmm0
6968; SSE-NEXT:    pand %xmm14, %xmm2
6969; SSE-NEXT:    por %xmm2, %xmm0
6970; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6971; SSE-NEXT:    movdqa %xmm10, %xmm2
6972; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
6973; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6974; SSE-NEXT:    # xmm5 = mem[1,1,1,1]
6975; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
6976; SSE-NEXT:    movdqa %xmm4, %xmm5
6977; SSE-NEXT:    pandn %xmm2, %xmm5
6978; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6979; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0]
6980; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3]
6981; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7]
6982; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6983; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
6984; SSE-NEXT:    pand %xmm4, %xmm2
6985; SSE-NEXT:    por %xmm5, %xmm2
6986; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6987; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6988; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
6989; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6990; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
6991; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
6992; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
6993; SSE-NEXT:    movdqa %xmm14, %xmm0
6994; SSE-NEXT:    pandn %xmm5, %xmm0
6995; SSE-NEXT:    pand %xmm14, %xmm2
6996; SSE-NEXT:    por %xmm2, %xmm0
6997; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6998; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6999; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7000; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7001; SSE-NEXT:    # xmm5 = mem[1,1,1,1]
7002; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
7003; SSE-NEXT:    movdqa %xmm4, %xmm5
7004; SSE-NEXT:    pandn %xmm2, %xmm5
7005; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7006; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7007; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
7008; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
7009; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7]
7010; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
7011; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
7012; SSE-NEXT:    pand %xmm4, %xmm6
7013; SSE-NEXT:    por %xmm5, %xmm6
7014; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7015; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7016; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
7017; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7018; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
7019; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
7020; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
7021; SSE-NEXT:    movdqa %xmm14, %xmm0
7022; SSE-NEXT:    pandn %xmm5, %xmm0
7023; SSE-NEXT:    pand %xmm14, %xmm6
7024; SSE-NEXT:    por %xmm6, %xmm0
7025; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7026; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7027; SSE-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7028; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7029; SSE-NEXT:    # xmm6 = mem[1,1,1,1]
7030; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
7031; SSE-NEXT:    movdqa %xmm4, %xmm6
7032; SSE-NEXT:    pandn %xmm5, %xmm6
7033; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7034; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7035; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0]
7036; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3]
7037; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7]
7038; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
7039; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7]
7040; SSE-NEXT:    pand %xmm4, %xmm5
7041; SSE-NEXT:    por %xmm6, %xmm5
7042; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7043; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7044; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
7045; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7046; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7]
7047; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
7048; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
7049; SSE-NEXT:    movdqa %xmm14, %xmm0
7050; SSE-NEXT:    pandn %xmm6, %xmm0
7051; SSE-NEXT:    pand %xmm14, %xmm5
7052; SSE-NEXT:    por %xmm5, %xmm0
7053; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7054; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7055; SSE-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7056; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7057; SSE-NEXT:    # xmm6 = mem[1,1,1,1]
7058; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
7059; SSE-NEXT:    movdqa %xmm4, %xmm6
7060; SSE-NEXT:    pandn %xmm5, %xmm6
7061; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7062; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7063; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0]
7064; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3]
7065; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
7066; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
7067; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7]
7068; SSE-NEXT:    pand %xmm4, %xmm7
7069; SSE-NEXT:    por %xmm6, %xmm7
7070; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7071; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7072; SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,2]
7073; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7074; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7]
7075; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
7076; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
7077; SSE-NEXT:    movdqa %xmm14, %xmm0
7078; SSE-NEXT:    pandn %xmm6, %xmm0
7079; SSE-NEXT:    pand %xmm14, %xmm7
7080; SSE-NEXT:    por %xmm7, %xmm0
7081; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7082; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7083; SSE-NEXT:    movdqa %xmm13, %xmm6
7084; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7085; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7086; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
7087; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
7088; SSE-NEXT:    movdqa %xmm4, %xmm12
7089; SSE-NEXT:    pandn %xmm6, %xmm12
7090; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7091; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7092; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0]
7093; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3]
7094; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7]
7095; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
7096; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
7097; SSE-NEXT:    pand %xmm4, %xmm6
7098; SSE-NEXT:    por %xmm12, %xmm6
7099; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7100; SSE-NEXT:    shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
7101; SSE-NEXT:    # xmm12 = xmm12[0,1],mem[0,2]
7102; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7103; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
7104; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
7105; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
7106; SSE-NEXT:    movdqa %xmm14, %xmm15
7107; SSE-NEXT:    pandn %xmm12, %xmm15
7108; SSE-NEXT:    pand %xmm14, %xmm6
7109; SSE-NEXT:    por %xmm6, %xmm15
7110; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7111; SSE-NEXT:    psrlq $48, %xmm9
7112; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
7113; SSE-NEXT:    # xmm12 = mem[2,2,3,3]
7114; SSE-NEXT:    punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0]
7115; SSE-NEXT:    movdqa %xmm4, %xmm6
7116; SSE-NEXT:    pandn %xmm12, %xmm6
7117; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
7118; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
7119; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
7120; SSE-NEXT:    pand %xmm4, %xmm8
7121; SSE-NEXT:    por %xmm6, %xmm8
7122; SSE-NEXT:    pshufhw $231, (%rsp), %xmm6 # 16-byte Folded Reload
7123; SSE-NEXT:    # xmm6 = mem[0,1,2,3,7,5,6,7]
7124; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
7125; SSE-NEXT:    movdqa %xmm14, %xmm12
7126; SSE-NEXT:    pandn %xmm6, %xmm12
7127; SSE-NEXT:    pand %xmm14, %xmm8
7128; SSE-NEXT:    por %xmm8, %xmm12
7129; SSE-NEXT:    movdqa %xmm12, (%rsp) # 16-byte Spill
7130; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7131; SSE-NEXT:    movdqa %xmm9, %xmm6
7132; SSE-NEXT:    psrlq $48, %xmm6
7133; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
7134; SSE-NEXT:    # xmm8 = mem[2,2,3,3]
7135; SSE-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0]
7136; SSE-NEXT:    movdqa %xmm4, %xmm6
7137; SSE-NEXT:    pandn %xmm8, %xmm6
7138; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
7139; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7140; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
7141; SSE-NEXT:    pand %xmm4, %xmm1
7142; SSE-NEXT:    por %xmm6, %xmm1
7143; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7144; SSE-NEXT:    # xmm6 = mem[0,1,2,3,7,5,6,7]
7145; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
7146; SSE-NEXT:    movdqa %xmm14, %xmm8
7147; SSE-NEXT:    pandn %xmm6, %xmm8
7148; SSE-NEXT:    pand %xmm14, %xmm1
7149; SSE-NEXT:    por %xmm1, %xmm8
7150; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7151; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7152; SSE-NEXT:    movdqa %xmm15, %xmm1
7153; SSE-NEXT:    psrlq $48, %xmm1
7154; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7155; SSE-NEXT:    # xmm6 = mem[2,2,3,3]
7156; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0]
7157; SSE-NEXT:    movdqa %xmm4, %xmm1
7158; SSE-NEXT:    pandn %xmm6, %xmm1
7159; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
7160; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
7161; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
7162; SSE-NEXT:    pand %xmm4, %xmm6
7163; SSE-NEXT:    por %xmm1, %xmm6
7164; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7165; SSE-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,7]
7166; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7167; SSE-NEXT:    movdqa %xmm14, %xmm8
7168; SSE-NEXT:    pandn %xmm1, %xmm8
7169; SSE-NEXT:    pand %xmm14, %xmm6
7170; SSE-NEXT:    por %xmm6, %xmm8
7171; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7172; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7173; SSE-NEXT:    movdqa %xmm12, %xmm1
7174; SSE-NEXT:    psrlq $48, %xmm1
7175; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7176; SSE-NEXT:    # xmm6 = mem[2,2,3,3]
7177; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0]
7178; SSE-NEXT:    movdqa %xmm4, %xmm1
7179; SSE-NEXT:    pandn %xmm6, %xmm1
7180; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
7181; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
7182; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
7183; SSE-NEXT:    pand %xmm4, %xmm3
7184; SSE-NEXT:    por %xmm1, %xmm3
7185; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7186; SSE-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,7]
7187; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7188; SSE-NEXT:    movdqa %xmm14, %xmm6
7189; SSE-NEXT:    pandn %xmm1, %xmm6
7190; SSE-NEXT:    pand %xmm14, %xmm3
7191; SSE-NEXT:    por %xmm3, %xmm6
7192; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7193; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7194; SSE-NEXT:    movdqa %xmm8, %xmm1
7195; SSE-NEXT:    psrlq $48, %xmm1
7196; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7197; SSE-NEXT:    # xmm3 = mem[2,2,3,3]
7198; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
7199; SSE-NEXT:    movdqa %xmm4, %xmm1
7200; SSE-NEXT:    pandn %xmm3, %xmm1
7201; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7]
7202; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
7203; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
7204; SSE-NEXT:    pand %xmm4, %xmm3
7205; SSE-NEXT:    por %xmm1, %xmm3
7206; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7207; SSE-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,7]
7208; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7209; SSE-NEXT:    movdqa %xmm14, %xmm6
7210; SSE-NEXT:    pandn %xmm1, %xmm6
7211; SSE-NEXT:    pand %xmm14, %xmm3
7212; SSE-NEXT:    por %xmm3, %xmm6
7213; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7214; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7215; SSE-NEXT:    movdqa %xmm6, %xmm1
7216; SSE-NEXT:    psrlq $48, %xmm1
7217; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7218; SSE-NEXT:    # xmm3 = mem[2,2,3,3]
7219; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
7220; SSE-NEXT:    movdqa %xmm4, %xmm1
7221; SSE-NEXT:    pandn %xmm3, %xmm1
7222; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
7223; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7224; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
7225; SSE-NEXT:    pand %xmm4, %xmm2
7226; SSE-NEXT:    por %xmm1, %xmm2
7227; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7228; SSE-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,7]
7229; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7230; SSE-NEXT:    movdqa %xmm14, %xmm3
7231; SSE-NEXT:    pandn %xmm1, %xmm3
7232; SSE-NEXT:    pand %xmm14, %xmm2
7233; SSE-NEXT:    por %xmm2, %xmm3
7234; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7235; SSE-NEXT:    psrlq $48, %xmm0
7236; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3]
7237; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
7238; SSE-NEXT:    movdqa %xmm4, %xmm1
7239; SSE-NEXT:    pandn %xmm2, %xmm1
7240; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
7241; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7242; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
7243; SSE-NEXT:    pand %xmm4, %xmm2
7244; SSE-NEXT:    por %xmm1, %xmm2
7245; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7246; SSE-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,7]
7247; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7248; SSE-NEXT:    movdqa %xmm14, %xmm0
7249; SSE-NEXT:    pandn %xmm1, %xmm0
7250; SSE-NEXT:    pand %xmm14, %xmm2
7251; SSE-NEXT:    por %xmm2, %xmm0
7252; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7253; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7254; SSE-NEXT:    movdqa %xmm7, %xmm1
7255; SSE-NEXT:    psrlq $48, %xmm1
7256; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7257; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3]
7258; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
7259; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7]
7260; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7261; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
7262; SSE-NEXT:    pand %xmm4, %xmm1
7263; SSE-NEXT:    pandn %xmm2, %xmm4
7264; SSE-NEXT:    por %xmm1, %xmm4
7265; SSE-NEXT:    pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7266; SSE-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,7]
7267; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7268; SSE-NEXT:    movdqa %xmm14, %xmm0
7269; SSE-NEXT:    pandn %xmm1, %xmm0
7270; SSE-NEXT:    pand %xmm14, %xmm4
7271; SSE-NEXT:    por %xmm4, %xmm0
7272; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7273; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7274; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
7275; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7276; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
7277; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7278; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
7279; SSE-NEXT:    # xmm11 = mem[0,1,0,3]
7280; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6]
7281; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7282; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1]
7283; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
7284; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7285; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7]
7286; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
7287; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7288; SSE-NEXT:    movdqa %xmm14, %xmm4
7289; SSE-NEXT:    pandn %xmm3, %xmm4
7290; SSE-NEXT:    andps %xmm14, %xmm1
7291; SSE-NEXT:    por %xmm1, %xmm4
7292; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7293; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7294; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
7295; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7296; SSE-NEXT:    # xmm3 = mem[2,3,2,3]
7297; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7298; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7299; SSE-NEXT:    # xmm1 = mem[0,1,0,3]
7300; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6]
7301; SSE-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1]
7302; SSE-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
7303; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7304; SSE-NEXT:    # xmm3 = mem[0,2,2,3,4,5,6,7]
7305; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
7306; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7307; SSE-NEXT:    movdqa %xmm14, %xmm9
7308; SSE-NEXT:    pandn %xmm3, %xmm9
7309; SSE-NEXT:    andps %xmm14, %xmm4
7310; SSE-NEXT:    por %xmm4, %xmm9
7311; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7312; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7313; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
7314; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7315; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
7316; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7317; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7318; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
7319; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7320; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7321; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1]
7322; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7323; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7324; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
7325; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7326; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7327; SSE-NEXT:    movdqa %xmm14, %xmm9
7328; SSE-NEXT:    pandn %xmm4, %xmm9
7329; SSE-NEXT:    andps %xmm14, %xmm3
7330; SSE-NEXT:    por %xmm3, %xmm9
7331; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7332; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7333; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
7334; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7335; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
7336; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7337; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7338; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
7339; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7340; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7341; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1]
7342; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7343; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7344; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
7345; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7346; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7347; SSE-NEXT:    movdqa %xmm14, %xmm9
7348; SSE-NEXT:    pandn %xmm4, %xmm9
7349; SSE-NEXT:    andps %xmm14, %xmm3
7350; SSE-NEXT:    por %xmm3, %xmm9
7351; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7352; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7353; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
7354; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7355; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
7356; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7357; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7358; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
7359; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7360; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7361; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1]
7362; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7363; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7364; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
7365; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7366; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7367; SSE-NEXT:    movdqa %xmm14, %xmm12
7368; SSE-NEXT:    pandn %xmm4, %xmm12
7369; SSE-NEXT:    andps %xmm14, %xmm3
7370; SSE-NEXT:    por %xmm3, %xmm12
7371; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7372; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
7373; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7374; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
7375; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7376; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7377; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
7378; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7379; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7380; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1]
7381; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7382; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7383; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
7384; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7385; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7386; SSE-NEXT:    movdqa %xmm14, %xmm15
7387; SSE-NEXT:    pandn %xmm4, %xmm15
7388; SSE-NEXT:    andps %xmm14, %xmm3
7389; SSE-NEXT:    por %xmm3, %xmm15
7390; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7391; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
7392; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7393; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
7394; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7395; SSE-NEXT:    pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7396; SSE-NEXT:    # xmm3 = mem[0,1,0,3]
7397; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7398; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7399; SSE-NEXT:    punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7400; SSE-NEXT:    # xmm3 = xmm3[1],mem[1]
7401; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7402; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7403; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
7404; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7405; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7406; SSE-NEXT:    movdqa %xmm14, %xmm8
7407; SSE-NEXT:    pandn %xmm4, %xmm8
7408; SSE-NEXT:    andps %xmm14, %xmm3
7409; SSE-NEXT:    por %xmm3, %xmm8
7410; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7411; SSE-NEXT:    # xmm3 = mem[1,1,1,1]
7412; SSE-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7413; SSE-NEXT:    # xmm4 = mem[2,3,2,3]
7414; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7415; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3]
7416; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7417; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7418; SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1]
7419; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7420; SSE-NEXT:    pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7421; SSE-NEXT:    # xmm4 = mem[0,2,2,3,4,5,6,7]
7422; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7423; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7424; SSE-NEXT:    movdqa %xmm14, %xmm7
7425; SSE-NEXT:    pandn %xmm4, %xmm7
7426; SSE-NEXT:    andps %xmm14, %xmm3
7427; SSE-NEXT:    por %xmm3, %xmm7
7428; SSE-NEXT:    psrlq $48, %xmm0
7429; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7430; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7431; SSE-NEXT:    movdqa %xmm2, %xmm3
7432; SSE-NEXT:    psrld $16, %xmm5
7433; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7]
7434; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1]
7435; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
7436; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7]
7437; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
7438; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
7439; SSE-NEXT:    movdqa %xmm14, %xmm6
7440; SSE-NEXT:    pandn %xmm3, %xmm6
7441; SSE-NEXT:    andps %xmm14, %xmm2
7442; SSE-NEXT:    por %xmm2, %xmm6
7443; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7444; SSE-NEXT:    psrlq $48, %xmm0
7445; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7446; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7447; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7448; SSE-NEXT:    movdqa %xmm2, %xmm3
7449; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7450; SSE-NEXT:    psrld $16, %xmm2
7451; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
7452; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
7453; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
7454; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7455; SSE-NEXT:    # xmm2 = mem[3,1,2,3,4,5,6,7]
7456; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7457; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7458; SSE-NEXT:    movdqa %xmm14, %xmm5
7459; SSE-NEXT:    pandn %xmm2, %xmm5
7460; SSE-NEXT:    andps %xmm14, %xmm1
7461; SSE-NEXT:    por %xmm1, %xmm5
7462; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7463; SSE-NEXT:    psrlq $48, %xmm1
7464; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7465; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7466; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
7467; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7468; SSE-NEXT:    psrld $16, %xmm2
7469; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7470; SSE-NEXT:    # xmm1 = mem[0,1,2,3,4,5,5,7]
7471; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
7472; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
7473; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7474; SSE-NEXT:    # xmm2 = mem[3,1,2,3,4,5,6,7]
7475; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7476; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7477; SSE-NEXT:    movdqa %xmm14, %xmm9
7478; SSE-NEXT:    pandn %xmm2, %xmm9
7479; SSE-NEXT:    andps %xmm14, %xmm1
7480; SSE-NEXT:    por %xmm1, %xmm9
7481; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7482; SSE-NEXT:    psrlq $48, %xmm0
7483; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7484; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7485; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7486; SSE-NEXT:    movdqa %xmm1, %xmm2
7487; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7488; SSE-NEXT:    psrld $16, %xmm0
7489; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7490; SSE-NEXT:    # xmm1 = mem[0,1,2,3,4,5,5,7]
7491; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
7492; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7493; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7494; SSE-NEXT:    # xmm2 = mem[3,1,2,3,4,5,6,7]
7495; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7496; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7497; SSE-NEXT:    movdqa %xmm14, %xmm11
7498; SSE-NEXT:    pandn %xmm2, %xmm11
7499; SSE-NEXT:    andps %xmm14, %xmm1
7500; SSE-NEXT:    por %xmm1, %xmm11
7501; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7502; SSE-NEXT:    psrlq $48, %xmm0
7503; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7504; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7505; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7506; SSE-NEXT:    movdqa %xmm1, %xmm2
7507; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7508; SSE-NEXT:    psrld $16, %xmm3
7509; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7510; SSE-NEXT:    # xmm1 = mem[0,1,2,3,4,5,5,7]
7511; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1]
7512; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7513; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7514; SSE-NEXT:    # xmm2 = mem[3,1,2,3,4,5,6,7]
7515; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7516; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7517; SSE-NEXT:    movdqa %xmm14, %xmm10
7518; SSE-NEXT:    pandn %xmm2, %xmm10
7519; SSE-NEXT:    andps %xmm14, %xmm1
7520; SSE-NEXT:    por %xmm1, %xmm10
7521; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7522; SSE-NEXT:    psrlq $48, %xmm0
7523; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7524; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7525; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7526; SSE-NEXT:    movdqa %xmm1, %xmm2
7527; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7528; SSE-NEXT:    psrld $16, %xmm3
7529; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7530; SSE-NEXT:    # xmm1 = mem[0,1,2,3,4,5,5,7]
7531; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1]
7532; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7533; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7534; SSE-NEXT:    # xmm2 = mem[3,1,2,3,4,5,6,7]
7535; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7536; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7537; SSE-NEXT:    movdqa %xmm14, %xmm4
7538; SSE-NEXT:    pandn %xmm2, %xmm4
7539; SSE-NEXT:    andps %xmm14, %xmm1
7540; SSE-NEXT:    por %xmm1, %xmm4
7541; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7542; SSE-NEXT:    psrlq $48, %xmm1
7543; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7544; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7545; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
7546; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7547; SSE-NEXT:    psrld $16, %xmm1
7548; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7549; SSE-NEXT:    # xmm2 = mem[0,1,2,3,4,5,5,7]
7550; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
7551; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
7552; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7553; SSE-NEXT:    # xmm1 = mem[3,1,2,3,4,5,6,7]
7554; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
7555; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
7556; SSE-NEXT:    movdqa %xmm14, %xmm3
7557; SSE-NEXT:    pandn %xmm1, %xmm3
7558; SSE-NEXT:    andps %xmm14, %xmm2
7559; SSE-NEXT:    por %xmm2, %xmm3
7560; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7561; SSE-NEXT:    psrlq $48, %xmm0
7562; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7563; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7564; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7565; SSE-NEXT:    movdqa %xmm1, %xmm2
7566; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7567; SSE-NEXT:    psrld $16, %xmm0
7568; SSE-NEXT:    pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7569; SSE-NEXT:    # xmm1 = mem[0,1,2,3,4,5,5,7]
7570; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
7571; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7572; SSE-NEXT:    andps %xmm14, %xmm1
7573; SSE-NEXT:    pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7574; SSE-NEXT:    # xmm2 = mem[3,1,2,3,4,5,6,7]
7575; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7576; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7577; SSE-NEXT:    pandn %xmm2, %xmm14
7578; SSE-NEXT:    por %xmm1, %xmm14
7579; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7580; SSE-NEXT:    movaps %xmm0, 96(%rsi)
7581; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7582; SSE-NEXT:    movaps %xmm0, 32(%rsi)
7583; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7584; SSE-NEXT:    movaps %xmm1, 112(%rsi)
7585; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7586; SSE-NEXT:    movaps %xmm1, 48(%rsi)
7587; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7588; SSE-NEXT:    movaps %xmm1, 64(%rsi)
7589; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7590; SSE-NEXT:    movaps %xmm1, (%rsi)
7591; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7592; SSE-NEXT:    movaps %xmm1, 80(%rsi)
7593; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7594; SSE-NEXT:    movaps %xmm1, 16(%rsi)
7595; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7596; SSE-NEXT:    movaps %xmm0, 96(%rdx)
7597; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7598; SSE-NEXT:    movaps %xmm0, 32(%rdx)
7599; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7600; SSE-NEXT:    movaps %xmm0, 112(%rdx)
7601; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7602; SSE-NEXT:    movaps %xmm0, 48(%rdx)
7603; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7604; SSE-NEXT:    movaps %xmm0, 64(%rdx)
7605; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7606; SSE-NEXT:    movaps %xmm0, (%rdx)
7607; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7608; SSE-NEXT:    movaps %xmm0, 80(%rdx)
7609; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7610; SSE-NEXT:    movaps %xmm0, 16(%rdx)
7611; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7612; SSE-NEXT:    movaps %xmm0, 96(%rcx)
7613; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7614; SSE-NEXT:    movaps %xmm0, 112(%rcx)
7615; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7616; SSE-NEXT:    movaps %xmm0, 64(%rcx)
7617; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7618; SSE-NEXT:    movaps %xmm0, 80(%rcx)
7619; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7620; SSE-NEXT:    movaps %xmm0, 32(%rcx)
7621; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7622; SSE-NEXT:    movaps %xmm0, 48(%rcx)
7623; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7624; SSE-NEXT:    movaps %xmm0, (%rcx)
7625; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7626; SSE-NEXT:    movaps %xmm0, 16(%rcx)
7627; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7628; SSE-NEXT:    movaps %xmm0, 112(%r8)
7629; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7630; SSE-NEXT:    movaps %xmm0, 96(%r8)
7631; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7632; SSE-NEXT:    movaps %xmm0, 80(%r8)
7633; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7634; SSE-NEXT:    movaps %xmm0, 64(%r8)
7635; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7636; SSE-NEXT:    movaps %xmm0, 48(%r8)
7637; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7638; SSE-NEXT:    movaps %xmm0, 32(%r8)
7639; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7640; SSE-NEXT:    movaps %xmm0, 16(%r8)
7641; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
7642; SSE-NEXT:    movaps %xmm0, (%r8)
7643; SSE-NEXT:    movdqa %xmm7, 112(%r9)
7644; SSE-NEXT:    movdqa %xmm8, 96(%r9)
7645; SSE-NEXT:    movdqa %xmm15, 80(%r9)
7646; SSE-NEXT:    movdqa %xmm12, 64(%r9)
7647; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7648; SSE-NEXT:    movaps %xmm0, 48(%r9)
7649; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7650; SSE-NEXT:    movaps %xmm0, 32(%r9)
7651; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7652; SSE-NEXT:    movaps %xmm0, 16(%r9)
7653; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7654; SSE-NEXT:    movaps %xmm0, (%r9)
7655; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7656; SSE-NEXT:    movdqa %xmm14, 112(%rax)
7657; SSE-NEXT:    movdqa %xmm3, 96(%rax)
7658; SSE-NEXT:    movdqa %xmm4, 80(%rax)
7659; SSE-NEXT:    movdqa %xmm10, 64(%rax)
7660; SSE-NEXT:    movdqa %xmm11, 48(%rax)
7661; SSE-NEXT:    movdqa %xmm9, 32(%rax)
7662; SSE-NEXT:    movdqa %xmm5, 16(%rax)
7663; SSE-NEXT:    movdqa %xmm6, (%rax)
7664; SSE-NEXT:    addq $1176, %rsp # imm = 0x498
7665; SSE-NEXT:    retq
7666;
7667; AVX-LABEL: load_i16_stride6_vf64:
7668; AVX:       # %bb.0:
7669; AVX-NEXT:    subq $1368, %rsp # imm = 0x558
7670; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
7671; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7672; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7673; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7674; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7675; AVX-NEXT:    vmovdqa 112(%rdi), %xmm1
7676; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7677; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
7678; AVX-NEXT:    vmovdqa 80(%rdi), %xmm1
7679; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7680; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
7681; AVX-NEXT:    vmovdqa 64(%rdi), %xmm2
7682; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7683; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7684; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7685; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
7686; AVX-NEXT:    vmovdqa (%rdi), %xmm3
7687; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7688; AVX-NEXT:    vmovdqa 16(%rdi), %xmm4
7689; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7690; AVX-NEXT:    vmovdqa 32(%rdi), %xmm5
7691; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
7692; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7693; AVX-NEXT:    vpsrlq $16, %xmm5, %xmm1
7694; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7695; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7696; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7697; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
7698; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
7699; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
7700; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7701; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
7702; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
7703; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
7704; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
7705; AVX-NEXT:    vmovdqa 176(%rdi), %xmm0
7706; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7707; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
7708; AVX-NEXT:    vmovdqa 160(%rdi), %xmm2
7709; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7710; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7711; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7712; AVX-NEXT:    vmovdqa 128(%rdi), %xmm2
7713; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7714; AVX-NEXT:    vpsrlq $16, %xmm2, %xmm2
7715; AVX-NEXT:    vmovdqa 144(%rdi), %xmm3
7716; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7717; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
7718; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7719; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
7720; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7721; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7]
7722; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
7723; AVX-NEXT:    vandps %ymm6, %ymm1, %ymm1
7724; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7725; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
7726; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm0
7727; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7728; AVX-NEXT:    vmovdqa 464(%rdi), %xmm0
7729; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
7730; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
7731; AVX-NEXT:    vmovdqa 448(%rdi), %xmm0
7732; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7733; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7734; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7735; AVX-NEXT:    vmovdqa 480(%rdi), %xmm0
7736; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7737; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7738; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7739; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
7740; AVX-NEXT:    vmovdqa 496(%rdi), %xmm0
7741; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7742; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7743; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
7744; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
7745; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7746; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm2
7747; AVX-NEXT:    vmovdqa 432(%rdi), %xmm0
7748; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7749; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3]
7750; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7]
7751; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7752; AVX-NEXT:    vmovdqa 384(%rdi), %xmm0
7753; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7754; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,3]
7755; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7]
7756; AVX-NEXT:    vmovdqa 400(%rdi), %xmm0
7757; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7758; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
7759; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
7760; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7761; AVX-NEXT:    vmovdqa 560(%rdi), %xmm0
7762; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7763; AVX-NEXT:    vpslld $16, %xmm0, %xmm2
7764; AVX-NEXT:    vmovdqa 544(%rdi), %xmm0
7765; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7766; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7767; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7768; AVX-NEXT:    vmovdqa 512(%rdi), %xmm0
7769; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7770; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm3
7771; AVX-NEXT:    vmovdqa 528(%rdi), %xmm0
7772; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7773; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[0,3,2,3]
7774; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7]
7775; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
7776; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7]
7777; AVX-NEXT:    vandps %ymm6, %ymm1, %ymm1
7778; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7779; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
7780; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm0
7781; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7782; AVX-NEXT:    vmovdqa 272(%rdi), %xmm0
7783; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7784; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
7785; AVX-NEXT:    vmovdqa 256(%rdi), %xmm0
7786; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7787; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7788; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7789; AVX-NEXT:    vmovdqa 288(%rdi), %xmm0
7790; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7791; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7792; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7793; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
7794; AVX-NEXT:    vmovdqa 304(%rdi), %xmm0
7795; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7796; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7797; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
7798; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
7799; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7800; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm2
7801; AVX-NEXT:    vmovdqa 240(%rdi), %xmm0
7802; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7803; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,3]
7804; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7]
7805; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7806; AVX-NEXT:    vmovdqa 192(%rdi), %xmm0
7807; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7808; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3]
7809; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7]
7810; AVX-NEXT:    vmovdqa 208(%rdi), %xmm0
7811; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7812; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
7813; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
7814; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7815; AVX-NEXT:    vmovdqa 368(%rdi), %xmm0
7816; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7817; AVX-NEXT:    vpslld $16, %xmm0, %xmm2
7818; AVX-NEXT:    vmovdqa 352(%rdi), %xmm0
7819; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7820; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7821; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7822; AVX-NEXT:    vmovdqa 320(%rdi), %xmm10
7823; AVX-NEXT:    vpsrlq $16, %xmm10, %xmm3
7824; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7825; AVX-NEXT:    vmovdqa 336(%rdi), %xmm0
7826; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7827; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3]
7828; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7]
7829; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
7830; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7]
7831; AVX-NEXT:    vandps %ymm6, %ymm1, %ymm1
7832; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7833; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
7834; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm0
7835; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7836; AVX-NEXT:    vmovdqa 656(%rdi), %xmm0
7837; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7838; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
7839; AVX-NEXT:    vmovdqa 640(%rdi), %xmm0
7840; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7841; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7842; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7843; AVX-NEXT:    vmovdqa 672(%rdi), %xmm0
7844; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7845; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7846; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7847; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
7848; AVX-NEXT:    vmovdqa 688(%rdi), %xmm0
7849; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7850; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7851; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
7852; AVX-NEXT:    vmovdqa 608(%rdi), %xmm0
7853; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7854; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm2
7855; AVX-NEXT:    vmovdqa 624(%rdi), %xmm0
7856; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7857; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
7858; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7]
7859; AVX-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
7860; AVX-NEXT:    vmovdqa 576(%rdi), %xmm0
7861; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7862; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
7863; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7]
7864; AVX-NEXT:    vmovdqa 592(%rdi), %xmm2
7865; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7866; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7867; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4,5],xmm0[6,7]
7868; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4,5,6,7]
7869; AVX-NEXT:    vmovdqa 752(%rdi), %xmm0
7870; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7871; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
7872; AVX-NEXT:    vmovdqa 736(%rdi), %xmm0
7873; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7874; AVX-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7875; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
7876; AVX-NEXT:    vmovdqa 704(%rdi), %xmm0
7877; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7878; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm0
7879; AVX-NEXT:    vmovdqa 720(%rdi), %xmm1
7880; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7881; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7882; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7]
7883; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
7884; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7]
7885; AVX-NEXT:    vandps %ymm6, %ymm2, %ymm2
7886; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
7887; AVX-NEXT:    vandnps %ymm0, %ymm6, %ymm0
7888; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
7889; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7890; AVX-NEXT:    vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7891; AVX-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
7892; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7893; AVX-NEXT:    vpsrld $16, %xmm2, %xmm2
7894; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7895; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7896; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
7897; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7898; AVX-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7899; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
7900; AVX-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7901; AVX-NEXT:    # xmm2 = mem[0,1,1,3,4,5,6,7]
7902; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1]
7903; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
7904; AVX-NEXT:    vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
7905; AVX-NEXT:    # xmm14 = mem[0,1,2,3,5,7,6,7]
7906; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7907; AVX-NEXT:    vpsrld $16, %xmm5, %xmm15
7908; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
7909; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7]
7910; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7911; AVX-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7912; AVX-NEXT:    # xmm2 = mem[0,1,1,3,4,5,6,7]
7913; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
7914; AVX-NEXT:    # xmm14 = mem[1,1,1,1]
7915; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
7916; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
7917; AVX-NEXT:    # xmm14 = mem[2,2,3,3]
7918; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
7919; AVX-NEXT:    # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3]
7920; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7]
7921; AVX-NEXT:    vandps %ymm6, %ymm0, %ymm0
7922; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7923; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
7924; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
7925; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7926; AVX-NEXT:    vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7927; AVX-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
7928; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7929; AVX-NEXT:    vpsrld $16, %xmm2, %xmm2
7930; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7931; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7932; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
7933; AVX-NEXT:    vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
7934; AVX-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7935; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
7936; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7]
7937; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
7938; AVX-NEXT:    # xmm13 = mem[1,1,1,1]
7939; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
7940; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7]
7941; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7942; AVX-NEXT:    vpsrld $16, %xmm5, %xmm13
7943; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
7944; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7]
7945; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7946; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7]
7947; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
7948; AVX-NEXT:    # xmm11 = mem[1,1,1,1]
7949; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
7950; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
7951; AVX-NEXT:    # xmm11 = mem[2,2,3,3]
7952; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
7953; AVX-NEXT:    # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
7954; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7]
7955; AVX-NEXT:    vmovaps %ymm6, %ymm13
7956; AVX-NEXT:    vandps %ymm6, %ymm0, %ymm0
7957; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7958; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
7959; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
7960; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7961; AVX-NEXT:    vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7962; AVX-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
7963; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7964; AVX-NEXT:    vpsrld $16, %xmm2, %xmm2
7965; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7966; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7967; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
7968; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7969; AVX-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7970; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
7971; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
7972; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
7973; AVX-NEXT:    # xmm9 = mem[1,1,1,1]
7974; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
7975; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
7976; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7977; AVX-NEXT:    vpsrld $16, %xmm5, %xmm9
7978; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
7979; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7]
7980; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7981; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
7982; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1]
7983; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
7984; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
7985; AVX-NEXT:    # xmm7 = mem[2,2,3,3]
7986; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
7987; AVX-NEXT:    # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
7988; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7]
7989; AVX-NEXT:    vandps %ymm6, %ymm0, %ymm0
7990; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7991; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
7992; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
7993; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7994; AVX-NEXT:    vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7995; AVX-NEXT:    # xmm0 = mem[0,1,2,3,5,7,6,7]
7996; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7997; AVX-NEXT:    vpsrld $16, %xmm12, %xmm2
7998; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7999; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8000; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3]
8001; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8002; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
8003; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
8004; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
8005; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8006; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
8007; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
8008; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7]
8009; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8010; AVX-NEXT:    vpsrld $16, %xmm5, %xmm4
8011; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8012; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
8013; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
8014; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
8015; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8016; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1]
8017; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8018; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8019; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3]
8020; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8021; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
8022; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
8023; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm0
8024; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
8025; AVX-NEXT:    vandnps %ymm1, %ymm13, %ymm1
8026; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
8027; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8028; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8029; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8030; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8031; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8032; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
8033; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
8034; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8035; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8036; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
8037; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8038; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
8039; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
8040; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8041; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8042; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
8043; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8044; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
8045; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
8046; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8047; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8048; AVX-NEXT:    # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7]
8049; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8050; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
8051; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
8052; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
8053; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
8054; AVX-NEXT:    vandps %ymm2, %ymm9, %ymm2
8055; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
8056; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8057; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
8058; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8059; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8060; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
8061; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8062; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
8063; AVX-NEXT:    # xmm3 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
8064; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8065; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
8066; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8067; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm0
8068; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
8069; AVX-NEXT:    vandnps %ymm2, %ymm13, %ymm2
8070; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
8071; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8072; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8073; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8074; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8075; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8076; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
8077; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8078; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8079; AVX-NEXT:    # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8080; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8081; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
8082; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
8083; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8084; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload
8085; AVX-NEXT:    # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8086; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8087; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm2
8088; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8089; AVX-NEXT:    vpblendw $48, (%rsp), %xmm3, %xmm0 # 16-byte Folded Reload
8090; AVX-NEXT:    # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
8091; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8092; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm3
8093; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
8094; AVX-NEXT:    vandnps %ymm1, %ymm9, %ymm0
8095; AVX-NEXT:    vandps %ymm2, %ymm9, %ymm2
8096; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
8097; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8098; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
8099; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8100; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8101; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
8102; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8103; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
8104; AVX-NEXT:    # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
8105; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8106; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm3
8107; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8108; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm0
8109; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
8110; AVX-NEXT:    vandnps %ymm2, %ymm13, %ymm2
8111; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
8112; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8113; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8114; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8115; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8116; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
8117; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
8118; AVX-NEXT:    # xmm2 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
8119; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8120; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
8121; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
8122; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload
8123; AVX-NEXT:    # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
8124; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8125; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
8126; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7]
8127; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8128; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
8129; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
8130; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
8131; AVX-NEXT:    vandps %ymm2, %ymm9, %ymm2
8132; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm2
8133; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8134; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8135; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8136; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
8137; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7]
8138; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8139; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm4
8140; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
8141; AVX-NEXT:    vandps %ymm2, %ymm13, %ymm2
8142; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
8143; AVX-NEXT:    vandnps %ymm3, %ymm13, %ymm3
8144; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm0
8145; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8146; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8147; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
8148; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8149; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8150; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
8151; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8152; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8153; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
8154; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8155; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm3
8156; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8157; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8158; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
8159; AVX-NEXT:    # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
8160; AVX-NEXT:    vpshufb %xmm15, %xmm3, %xmm0
8161; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8162; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload
8163; AVX-NEXT:    # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
8164; AVX-NEXT:    vpshufb %xmm14, %xmm15, %xmm6
8165; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm6, %ymm0
8166; AVX-NEXT:    vandnps %ymm2, %ymm9, %ymm2
8167; AVX-NEXT:    vandps %ymm0, %ymm9, %ymm0
8168; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
8169; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8170; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1]
8171; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8172; AVX-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8173; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
8174; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8175; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8176; AVX-NEXT:    # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7]
8177; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm14
8178; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7]
8179; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm0
8180; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
8181; AVX-NEXT:    vandnps %ymm6, %ymm13, %ymm6
8182; AVX-NEXT:    vmovaps %ymm13, %ymm5
8183; AVX-NEXT:    vorps %ymm6, %ymm0, %ymm0
8184; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8185; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8186; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
8187; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8188; AVX-NEXT:    # xmm6 = mem[2,2,3,3]
8189; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
8190; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
8191; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8192; AVX-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
8193; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7]
8194; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8195; AVX-NEXT:    vpshufb %xmm14, %xmm6, %xmm7
8196; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
8197; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8198; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
8199; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
8200; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
8201; AVX-NEXT:    vandps %ymm7, %ymm9, %ymm7
8202; AVX-NEXT:    vorps %ymm0, %ymm7, %ymm0
8203; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8204; AVX-NEXT:    vpsrlq $48, %xmm7, %xmm7
8205; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8206; AVX-NEXT:    # xmm8 = mem[2,2,3,3]
8207; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0]
8208; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8209; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
8210; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
8211; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm0
8212; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
8213; AVX-NEXT:    vandnps %ymm7, %ymm13, %ymm7
8214; AVX-NEXT:    vorps %ymm7, %ymm0, %ymm0
8215; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8216; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8217; AVX-NEXT:    vpsrlq $48, %xmm13, %xmm0
8218; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8219; AVX-NEXT:    # xmm7 = mem[2,2,3,3]
8220; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
8221; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8222; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
8223; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7]
8224; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8225; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm7
8226; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8227; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm8
8228; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
8229; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
8230; AVX-NEXT:    vandps %ymm7, %ymm9, %ymm7
8231; AVX-NEXT:    vorps %ymm0, %ymm7, %ymm0
8232; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8233; AVX-NEXT:    vpsrlq $48, %xmm10, %xmm7
8234; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8235; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3]
8236; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0]
8237; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8238; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm8
8239; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
8240; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
8241; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
8242; AVX-NEXT:    vandnps %ymm7, %ymm5, %ymm7
8243; AVX-NEXT:    vorps %ymm7, %ymm0, %ymm0
8244; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8245; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8246; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm0
8247; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3]
8248; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
8249; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8250; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm4
8251; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7]
8252; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
8253; AVX-NEXT:    vpshufb %xmm6, %xmm15, %xmm4
8254; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
8255; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
8256; AVX-NEXT:    vandps %ymm3, %ymm9, %ymm3
8257; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
8258; AVX-NEXT:    vpsrlq $48, %xmm11, %xmm3
8259; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8260; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm15[2,2,3,3]
8261; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
8262; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
8263; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
8264; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
8265; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
8266; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
8267; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
8268; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8269; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8270; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
8271; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8272; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
8273; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
8274; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8275; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
8276; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
8277; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8278; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
8279; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8280; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
8281; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
8282; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
8283; AVX-NEXT:    vandps %ymm2, %ymm9, %ymm1
8284; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
8285; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8286; AVX-NEXT:    vpsrlq $48, %xmm1, %xmm1
8287; AVX-NEXT:    vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8288; AVX-NEXT:    # xmm2 = mem[2,2,3,3]
8289; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
8290; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8291; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
8292; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
8293; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
8294; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
8295; AVX-NEXT:    vandnps %ymm1, %ymm5, %ymm1
8296; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
8297; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8298; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8299; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8300; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8301; AVX-NEXT:    # xmm1 = mem[2,3,2,3]
8302; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8303; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8304; AVX-NEXT:    vpblendw $243, (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload
8305; AVX-NEXT:    # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7]
8306; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
8307; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
8308; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
8309; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
8310; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8311; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
8312; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8313; AVX-NEXT:    # xmm3 = mem[2,3,2,3]
8314; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8315; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8316; AVX-NEXT:    # xmm3 = mem[0,1,0,3]
8317; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8318; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
8319; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm13[1]
8320; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
8321; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8322; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
8323; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
8324; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
8325; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8326; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
8327; AVX-NEXT:    # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8328; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8329; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3]
8330; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8331; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
8332; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1]
8333; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
8334; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
8335; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
8336; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
8337; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8338; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8339; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8340; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8341; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
8342; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8343; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8344; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
8345; AVX-NEXT:    # xmm13 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8346; AVX-NEXT:    vpshufb %xmm1, %xmm13, %xmm2
8347; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
8348; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8349; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
8350; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8351; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3]
8352; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8353; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8354; AVX-NEXT:    # xmm3 = mem[0,1,0,3]
8355; AVX-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,4,6]
8356; AVX-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
8357; AVX-NEXT:    # xmm6 = xmm6[1],mem[1]
8358; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7]
8359; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
8360; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
8361; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
8362; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8363; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload
8364; AVX-NEXT:    # xmm9 = mem[0,1],xmm2[2,3],mem[4,5,6,7]
8365; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8366; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8367; AVX-NEXT:    # xmm6 = mem[0,1,0,3]
8368; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6]
8369; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8370; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1]
8371; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm10
8372; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7]
8373; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
8374; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7]
8375; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8376; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8377; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8378; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8379; AVX-NEXT:    # xmm7 = mem[2,3,2,3]
8380; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8381; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8382; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8383; AVX-NEXT:    # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8384; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8385; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
8386; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
8387; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8388; AVX-NEXT:    # xmm7 = mem[1,1,1,1]
8389; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
8390; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
8391; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
8392; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8393; AVX-NEXT:    # xmm9 = mem[0,1,0,3]
8394; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8395; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6]
8396; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm8[1]
8397; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7]
8398; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
8399; AVX-NEXT:    vandps %ymm5, %ymm7, %ymm7
8400; AVX-NEXT:    vorps %ymm0, %ymm7, %ymm0
8401; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8402; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload
8403; AVX-NEXT:    # xmm8 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8404; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8405; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3]
8406; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8407; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6]
8408; AVX-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8409; AVX-NEXT:    # xmm7 = xmm7[1],mem[1]
8410; AVX-NEXT:    vpshufb %xmm1, %xmm8, %xmm14
8411; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7]
8412; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
8413; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7]
8414; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8415; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8416; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
8417; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8418; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3]
8419; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8420; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8421; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8422; AVX-NEXT:    # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8423; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8424; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
8425; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
8426; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8427; AVX-NEXT:    # xmm7 = mem[1,1,1,1]
8428; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8429; AVX-NEXT:    # xmm15 = mem[2,3,2,3]
8430; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3]
8431; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8432; AVX-NEXT:    # xmm15 = mem[0,1,0,3]
8433; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,4,6]
8434; AVX-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
8435; AVX-NEXT:    # xmm9 = xmm9[1],mem[1]
8436; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7]
8437; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
8438; AVX-NEXT:    vandps %ymm5, %ymm7, %ymm7
8439; AVX-NEXT:    vorps %ymm0, %ymm7, %ymm0
8440; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8441; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8442; AVX-NEXT:    # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8443; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
8444; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
8445; AVX-NEXT:    # xmm14 = mem[0,1,0,3]
8446; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,4,6]
8447; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8448; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1]
8449; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
8450; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
8451; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
8452; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8453; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8454; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
8455; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8456; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8457; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
8458; AVX-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
8459; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm4
8460; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm8
8461; AVX-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8462; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8463; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8464; AVX-NEXT:    vpsrld $16, %xmm9, %xmm9
8465; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
8466; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1]
8467; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
8468; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
8469; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
8470; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
8471; AVX-NEXT:    vpsrld $16, %xmm2, %xmm4
8472; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7]
8473; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1]
8474; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8475; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
8476; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
8477; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
8478; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5,6,7]
8479; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8480; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm2
8481; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8482; AVX-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8483; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8484; AVX-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
8485; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
8486; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
8487; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8488; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm4
8489; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8490; AVX-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8491; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
8492; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8493; AVX-NEXT:    vpsrld $16, %xmm6, %xmm6
8494; AVX-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8495; AVX-NEXT:    # xmm8 = mem[0,1,2,3,4,5,5,7]
8496; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1]
8497; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3,4,5,6,7]
8498; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
8499; AVX-NEXT:    vandps %ymm5, %ymm4, %ymm4
8500; AVX-NEXT:    vorps %ymm2, %ymm4, %ymm2
8501; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8502; AVX-NEXT:    vpsrld $16, %xmm4, %xmm4
8503; AVX-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8504; AVX-NEXT:    # xmm6 = mem[0,1,2,3,4,5,5,7]
8505; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
8506; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8507; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
8508; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
8509; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
8510; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
8511; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8512; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm4
8513; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8514; AVX-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8515; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
8516; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8517; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
8518; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
8519; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8520; AVX-NEXT:    vpsrlq $48, %xmm6, %xmm6
8521; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8522; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8523; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
8524; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8525; AVX-NEXT:    vpsrld $16, %xmm8, %xmm8
8526; AVX-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8527; AVX-NEXT:    # xmm9 = mem[0,1,2,3,4,5,5,7]
8528; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1]
8529; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5,6,7]
8530; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
8531; AVX-NEXT:    vandps %ymm5, %ymm6, %ymm6
8532; AVX-NEXT:    vorps %ymm4, %ymm6, %ymm4
8533; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8534; AVX-NEXT:    vpsrld $16, %xmm6, %xmm6
8535; AVX-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8536; AVX-NEXT:    # xmm8 = mem[0,1,2,3,4,5,5,7]
8537; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1]
8538; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8539; AVX-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
8540; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
8541; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
8542; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7]
8543; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8544; AVX-NEXT:    vpsrlq $48, %xmm6, %xmm6
8545; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8546; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
8547; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8548; AVX-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
8549; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm6
8550; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8551; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm8
8552; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8553; AVX-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8554; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8555; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8556; AVX-NEXT:    vpsrld $16, %xmm9, %xmm9
8557; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,5,5,7]
8558; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1]
8559; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7]
8560; AVX-NEXT:    vandnps %ymm6, %ymm5, %ymm6
8561; AVX-NEXT:    vandps %ymm5, %ymm8, %ymm5
8562; AVX-NEXT:    vorps %ymm6, %ymm5, %ymm5
8563; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
8564; AVX-NEXT:    vpsrld $16, %xmm1, %xmm6
8565; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7]
8566; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
8567; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
8568; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
8569; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7]
8570; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8571; AVX-NEXT:    vmovaps %ymm1, 96(%rsi)
8572; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8573; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
8574; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8575; AVX-NEXT:    vmovaps %ymm1, 64(%rsi)
8576; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8577; AVX-NEXT:    vmovaps %ymm1, (%rsi)
8578; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8579; AVX-NEXT:    vmovaps %ymm1, 96(%rdx)
8580; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8581; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
8582; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8583; AVX-NEXT:    vmovaps %ymm1, 64(%rdx)
8584; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8585; AVX-NEXT:    vmovaps %ymm1, (%rdx)
8586; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8587; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
8588; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8589; AVX-NEXT:    vmovaps %ymm1, 96(%rcx)
8590; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8591; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
8592; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8593; AVX-NEXT:    vmovaps %ymm1, (%rcx)
8594; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8595; AVX-NEXT:    vmovaps %ymm1, 96(%r8)
8596; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8597; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
8598; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8599; AVX-NEXT:    vmovaps %ymm1, 64(%r8)
8600; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8601; AVX-NEXT:    vmovaps %ymm1, (%r8)
8602; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8603; AVX-NEXT:    vmovaps %ymm1, 96(%r9)
8604; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8605; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
8606; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8607; AVX-NEXT:    vmovaps %ymm1, (%r9)
8608; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8609; AVX-NEXT:    vmovaps %ymm1, 64(%r9)
8610; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8611; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
8612; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
8613; AVX-NEXT:    vmovaps %ymm2, 64(%rax)
8614; AVX-NEXT:    vmovaps %ymm3, (%rax)
8615; AVX-NEXT:    addq $1368, %rsp # imm = 0x558
8616; AVX-NEXT:    vzeroupper
8617; AVX-NEXT:    retq
8618;
8619; AVX2-LABEL: load_i16_stride6_vf64:
8620; AVX2:       # %bb.0:
8621; AVX2-NEXT:    subq $1272, %rsp # imm = 0x4F8
8622; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
8623; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
8624; AVX2-NEXT:    vmovaps 672(%rdi), %ymm2
8625; AVX2-NEXT:    vmovaps 640(%rdi), %ymm3
8626; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm4
8627; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm5
8628; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm8
8629; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8630; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm9
8631; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8632; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm6
8633; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm7
8634; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
8635; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8636; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
8637; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8638; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
8639; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8640; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
8641; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
8642; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8643; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
8644; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8645; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
8646; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8647; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
8648; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8649; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
8650; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
8651; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
8652; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
8653; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
8654; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7]
8655; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
8656; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
8657; AVX2-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
8658; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
8659; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
8660; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8661; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
8662; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8663; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
8664; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8665; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
8666; AVX2-NEXT:    vpshufb %xmm6, %xmm9, %xmm3
8667; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm11
8668; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[2,2,2,2,4,5,6,7]
8669; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7]
8670; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
8671; AVX2-NEXT:    vpshufb %ymm2, %ymm10, %ymm7
8672; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
8673; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8674; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
8675; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8676; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
8677; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8678; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
8679; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
8680; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm8
8681; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7]
8682; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
8683; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8684; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
8685; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm13
8686; AVX2-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
8687; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8688; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm13
8689; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8690; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm12
8691; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8692; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
8693; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm13
8694; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm6
8695; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7]
8696; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
8697; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8698; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8699; AVX2-NEXT:    # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
8700; AVX2-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
8701; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
8702; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8703; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
8704; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
8705; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3]
8706; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
8707; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
8708; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
8709; AVX2-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
8710; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
8711; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8712; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
8713; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
8714; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
8715; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
8716; AVX2-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
8717; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
8718; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8719; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
8720; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3]
8721; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
8722; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
8723; AVX2-NEXT:    vpshufb %ymm9, %ymm7, %ymm3
8724; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
8725; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8726; AVX2-NEXT:    vpshufb %ymm9, %ymm13, %ymm1
8727; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
8728; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
8729; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
8730; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
8731; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
8732; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8733; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8734; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
8735; AVX2-NEXT:    # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
8736; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm0
8737; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8738; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
8739; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8740; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
8741; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
8742; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
8743; AVX2-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
8744; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8745; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8746; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
8747; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8748; AVX2-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
8749; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
8750; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm1
8751; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8752; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm2
8753; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8754; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
8755; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7]
8756; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm2
8757; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8758; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
8759; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
8760; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
8761; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8762; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
8763; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
8764; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8765; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8766; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
8767; AVX2-NEXT:    # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
8768; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm0
8769; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8770; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
8771; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8772; AVX2-NEXT:    vpshufb %xmm10, %xmm6, %xmm2
8773; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
8774; AVX2-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
8775; AVX2-NEXT:    # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
8776; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8777; AVX2-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
8778; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
8779; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm2
8780; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8781; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm3
8782; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8783; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
8784; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
8785; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm3
8786; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8787; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8788; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
8789; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8790; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
8791; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
8792; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8793; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8794; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
8795; AVX2-NEXT:    # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
8796; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm14
8797; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
8798; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8799; AVX2-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
8800; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
8801; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8802; AVX2-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
8803; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
8804; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8805; AVX2-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
8806; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
8807; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm8
8808; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8809; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm2
8810; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8811; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7]
8812; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,2,2,2,4,5,6,7]
8813; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm12
8814; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3,4],xmm12[5,6,7]
8815; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
8816; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8817; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
8818; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
8819; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8820; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm8
8821; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8822; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm0
8823; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8824; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
8825; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7]
8826; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm11
8827; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4],xmm11[5,6,7]
8828; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm9
8829; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8830; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8831; AVX2-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
8832; AVX2-NEXT:    vpshufb %xmm10, %xmm1, %xmm8
8833; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm10
8834; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm10[0,2,0,3]
8835; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7]
8836; AVX2-NEXT:    vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm15[2],xmm8[3],xmm15[4,5],xmm8[6,7]
8837; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8838; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
8839; AVX2-NEXT:    # ymm8 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7]
8840; AVX2-NEXT:    vpshufb %ymm13, %ymm8, %ymm13
8841; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
8842; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
8843; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15]
8844; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
8845; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8846; AVX2-NEXT:    vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8847; AVX2-NEXT:    # xmm9 = mem[2,1,0,3]
8848; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
8849; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
8850; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
8851; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm9[2],xmm5[3,4],xmm9[5],xmm5[6],xmm9[7]
8852; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
8853; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8854; AVX2-NEXT:    vpshufb %ymm9, %ymm13, %ymm13
8855; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
8856; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
8857; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7]
8858; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
8859; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8860; AVX2-NEXT:    vpshufb %xmm5, %xmm15, %xmm15
8861; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
8862; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3],xmm15[4,5],xmm7[6],xmm15[7]
8863; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
8864; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15]
8865; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
8866; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8867; AVX2-NEXT:    vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8868; AVX2-NEXT:    # xmm7 = mem[2,1,0,3]
8869; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
8870; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
8871; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
8872; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6],xmm7[7]
8873; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8874; AVX2-NEXT:    vpshufb %ymm9, %ymm7, %ymm7
8875; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7]
8876; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
8877; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
8878; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8879; AVX2-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
8880; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
8881; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
8882; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
8883; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
8884; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
8885; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8886; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[2,1,0,3]
8887; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
8888; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
8889; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
8890; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6],xmm4[7]
8891; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8892; AVX2-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
8893; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
8894; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
8895; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
8896; AVX2-NEXT:    vpshufb %xmm5, %xmm12, %xmm4
8897; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
8898; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
8899; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8900; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
8901; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
8902; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8903; AVX2-NEXT:    vpshufb %xmm5, %xmm11, %xmm2
8904; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
8905; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
8906; AVX2-NEXT:    vpshufb %ymm9, %ymm8, %ymm2
8907; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[2,1,0,3]
8908; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
8909; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
8910; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
8911; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6],xmm3[7]
8912; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7]
8913; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
8914; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
8915; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8916; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
8917; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8918; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8919; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8920; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
8921; AVX2-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
8922; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8923; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8924; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8925; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
8926; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
8927; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8928; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8929; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
8930; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8931; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
8932; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
8933; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
8934; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
8935; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8936; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8937; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
8938; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
8939; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
8940; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
8941; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
8942; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
8943; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
8944; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
8945; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8946; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
8947; AVX2-NEXT:    vpshufb %ymm8, %ymm3, %ymm2
8948; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
8949; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
8950; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
8951; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8952; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8953; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8954; AVX2-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
8955; AVX2-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
8956; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8957; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8958; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8959; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
8960; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
8961; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8962; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8963; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
8964; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8965; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
8966; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
8967; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
8968; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
8969; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8970; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8971; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
8972; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
8973; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8974; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
8975; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
8976; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8977; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
8978; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
8979; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
8980; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
8981; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8982; AVX2-NEXT:    vpshufb %ymm8, %ymm3, %ymm2
8983; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
8984; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
8985; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
8986; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8987; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8988; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8989; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
8990; AVX2-NEXT:    # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
8991; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8992; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8993; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
8994; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
8995; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1]
8996; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
8997; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
8998; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
8999; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4]
9000; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9001; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9002; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9003; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
9004; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
9005; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
9006; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1]
9007; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7]
9008; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
9009; AVX2-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7]
9010; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7]
9011; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
9012; AVX2-NEXT:    vpshufb %ymm8, %ymm9, %ymm13
9013; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15]
9014; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4]
9015; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
9016; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7]
9017; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9018; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9019; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
9020; AVX2-NEXT:    # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9021; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm12
9022; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
9023; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3]
9024; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7]
9025; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
9026; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4]
9027; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
9028; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9029; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
9030; AVX2-NEXT:    # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7]
9031; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9032; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
9033; AVX2-NEXT:    # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
9034; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3]
9035; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
9036; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
9037; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7]
9038; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3]
9039; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7]
9040; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7]
9041; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9042; AVX2-NEXT:    vpshufb %ymm8, %ymm15, %ymm8
9043; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
9044; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
9045; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
9046; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
9047; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9048; AVX2-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9049; AVX2-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
9050; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9051; AVX2-NEXT:    # xmm8 = mem[1,1,1,1,4,5,6,7]
9052; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
9053; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7]
9054; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
9055; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
9056; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
9057; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
9058; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
9059; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
9060; AVX2-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
9061; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9062; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
9063; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
9064; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
9065; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
9066; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9067; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,5,6,5]
9068; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
9069; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
9070; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7]
9071; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
9072; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
9073; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
9074; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
9075; AVX2-NEXT:    vpshufb %ymm7, %ymm9, %ymm2
9076; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9077; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9078; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
9079; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9080; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9081; AVX2-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9082; AVX2-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
9083; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9084; AVX2-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
9085; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
9086; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
9087; AVX2-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9088; AVX2-NEXT:    # xmm1 = mem[3,1,2,1,4,5,6,7]
9089; AVX2-NEXT:    vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9090; AVX2-NEXT:    # xmm2 = mem[0,1,3,3,4,5,6,7]
9091; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
9092; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
9093; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9094; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
9095; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9096; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9097; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
9098; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9099; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9100; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5]
9101; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7]
9102; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
9103; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
9104; AVX2-NEXT:    vpshufb %ymm7, %ymm15, %ymm1
9105; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,1,4,5,6,7]
9106; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7]
9107; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
9108; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
9109; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9110; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
9111; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
9112; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
9113; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9114; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9115; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9116; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9117; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
9118; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
9119; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
9120; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
9121; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
9122; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm3
9123; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
9124; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9125; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9126; AVX2-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7]
9127; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9128; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9129; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9130; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm11
9131; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1]
9132; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7]
9133; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
9134; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm4
9135; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7]
9136; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9137; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
9138; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7]
9139; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9140; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9141; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9142; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm13
9143; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1]
9144; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7]
9145; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
9146; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm10
9147; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7]
9148; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9149; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9150; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7]
9151; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9152; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9153; AVX2-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9154; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm15
9155; AVX2-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
9156; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
9157; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7]
9158; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
9159; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7]
9160; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
9161; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
9162; AVX2-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7]
9163; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
9164; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
9165; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
9166; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
9167; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
9168; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
9169; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
9170; AVX2-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
9171; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
9172; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
9173; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
9174; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
9175; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9176; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9177; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
9178; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm7
9179; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7]
9180; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
9181; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
9182; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
9183; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
9184; AVX2-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
9185; AVX2-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
9186; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
9187; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
9188; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
9189; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9190; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9191; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
9192; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9193; AVX2-NEXT:    vmovaps %ymm9, 96(%rsi)
9194; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9195; AVX2-NEXT:    vmovaps %ymm9, 32(%rsi)
9196; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9197; AVX2-NEXT:    vmovaps %ymm9, 64(%rsi)
9198; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9199; AVX2-NEXT:    vmovaps %ymm9, (%rsi)
9200; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9201; AVX2-NEXT:    vmovaps %ymm9, 96(%rdx)
9202; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9203; AVX2-NEXT:    vmovaps %ymm9, 32(%rdx)
9204; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9205; AVX2-NEXT:    vmovaps %ymm9, 64(%rdx)
9206; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9207; AVX2-NEXT:    vmovaps %ymm9, (%rdx)
9208; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9209; AVX2-NEXT:    vmovaps %ymm9, 32(%rcx)
9210; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9211; AVX2-NEXT:    vmovaps %ymm9, 96(%rcx)
9212; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9213; AVX2-NEXT:    vmovaps %ymm9, 64(%rcx)
9214; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9215; AVX2-NEXT:    vmovaps %ymm9, (%rcx)
9216; AVX2-NEXT:    vmovdqa %ymm6, 96(%r8)
9217; AVX2-NEXT:    vmovdqa %ymm8, 32(%r8)
9218; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
9219; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
9220; AVX2-NEXT:    vmovdqa %ymm5, (%r8)
9221; AVX2-NEXT:    vmovdqa %ymm10, 96(%r9)
9222; AVX2-NEXT:    vmovdqa %ymm2, 32(%r9)
9223; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
9224; AVX2-NEXT:    vmovdqa %ymm3, 64(%r9)
9225; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9226; AVX2-NEXT:    vmovdqa %ymm0, 96(%rax)
9227; AVX2-NEXT:    vmovdqa %ymm7, 32(%rax)
9228; AVX2-NEXT:    vmovdqa %ymm1, 64(%rax)
9229; AVX2-NEXT:    vmovdqa %ymm11, (%rax)
9230; AVX2-NEXT:    addq $1272, %rsp # imm = 0x4F8
9231; AVX2-NEXT:    vzeroupper
9232; AVX2-NEXT:    retq
9233;
9234; AVX2-FP-LABEL: load_i16_stride6_vf64:
9235; AVX2-FP:       # %bb.0:
9236; AVX2-FP-NEXT:    subq $1304, %rsp # imm = 0x518
9237; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
9238; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
9239; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm2
9240; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm3
9241; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm4
9242; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm5
9243; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm8
9244; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9245; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm9
9246; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9247; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm6
9248; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm7
9249; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
9250; AVX2-FP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
9251; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
9252; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9253; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
9254; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9255; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
9256; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
9257; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9258; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
9259; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9260; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
9261; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9262; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
9263; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9264; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
9265; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
9266; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
9267; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
9268; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm5
9269; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7]
9270; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
9271; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
9272; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
9273; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
9274; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
9275; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9276; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm6
9277; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9278; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm7
9279; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9280; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
9281; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
9282; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm11
9283; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
9284; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
9285; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
9286; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm10, %ymm8
9287; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
9288; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9289; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm8
9290; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9291; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm6
9292; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9293; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
9294; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm6, %xmm8
9295; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm9
9296; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
9297; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
9298; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9299; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
9300; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm8, %ymm13
9301; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
9302; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9303; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm13
9304; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9305; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm12
9306; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9307; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
9308; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm12, %xmm13
9309; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm3
9310; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7]
9311; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
9312; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9313; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
9314; AVX2-FP-NEXT:    # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
9315; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
9316; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
9317; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9318; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
9319; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
9320; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
9321; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
9322; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
9323; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm10, %ymm10
9324; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
9325; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9326; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
9327; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
9328; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
9329; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
9330; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
9331; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9332; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
9333; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
9334; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
9335; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm8, %ymm4
9336; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
9337; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9338; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm13, %ymm1
9339; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
9340; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
9341; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
9342; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
9343; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9344; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9345; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
9346; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9347; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm0
9348; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
9349; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9350; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
9351; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
9352; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
9353; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
9354; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
9355; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
9356; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9357; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
9358; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9359; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
9360; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9361; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm0
9362; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9363; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm3
9364; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9365; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
9366; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
9367; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
9368; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm4
9369; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9370; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
9371; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9372; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
9373; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9374; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9375; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9376; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9377; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
9378; AVX2-FP-NEXT:    # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9379; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm2
9380; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9381; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9382; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm6, %xmm2
9383; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
9384; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
9385; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
9386; AVX2-FP-NEXT:    # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
9387; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9388; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
9389; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm15
9390; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
9391; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm3
9392; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9393; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm4
9394; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9395; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
9396; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
9397; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm4
9398; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9399; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
9400; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9401; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
9402; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9403; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9404; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9405; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9406; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
9407; AVX2-FP-NEXT:    # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9408; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm2
9409; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9410; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9411; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm2
9412; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
9413; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
9414; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9415; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
9416; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
9417; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9418; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm3, %ymm3
9419; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
9420; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm3
9421; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9422; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm4
9423; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9424; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
9425; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm9
9426; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm3
9427; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9428; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm10
9429; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
9430; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
9431; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15]
9432; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
9433; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9434; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm2
9435; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9436; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm3
9437; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9438; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
9439; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm9
9440; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm13
9441; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
9442; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
9443; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9444; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9445; AVX2-FP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
9446; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm10
9447; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3]
9448; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm12
9449; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
9450; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7]
9451; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9452; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9453; AVX2-FP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
9454; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm10, %ymm15
9455; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
9456; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
9457; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
9458; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
9459; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9460; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
9461; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
9462; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9463; AVX2-FP-NEXT:    # xmm9 = mem[1,1,1,1,4,5,6,7]
9464; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
9465; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
9466; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9467; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
9468; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
9469; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
9470; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
9471; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
9472; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9473; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
9474; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
9475; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7]
9476; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9477; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
9478; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9479; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9480; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
9481; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9482; AVX2-FP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
9483; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
9484; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9485; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
9486; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
9487; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
9488; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9489; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
9490; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
9491; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
9492; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9493; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
9494; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9495; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9496; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm5, %xmm0
9497; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9498; AVX2-FP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
9499; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
9500; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9501; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
9502; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
9503; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
9504; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9505; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
9506; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
9507; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
9508; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9509; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
9510; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9511; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9512; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm13, %xmm0
9513; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
9514; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
9515; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm10, %ymm1
9516; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
9517; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7]
9518; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
9519; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
9520; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
9521; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9522; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
9523; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9524; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9525; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
9526; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9527; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
9528; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9529; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9530; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9531; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9532; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
9533; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
9534; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
9535; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
9536; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9537; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
9538; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
9539; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
9540; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9541; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9542; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9543; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
9544; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
9545; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
9546; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
9547; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
9548; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm1
9549; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
9550; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
9551; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9552; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
9553; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
9554; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9555; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
9556; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9557; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9558; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9559; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9560; AVX2-FP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9561; AVX2-FP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
9562; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9563; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9564; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9565; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9566; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
9567; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9568; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
9569; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
9570; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9571; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
9572; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
9573; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9574; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9575; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9576; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
9577; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
9578; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
9579; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9580; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
9581; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9582; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
9583; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
9584; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
9585; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9586; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
9587; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9588; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
9589; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9590; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9591; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9592; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9593; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
9594; AVX2-FP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
9595; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9596; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9597; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9598; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
9599; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
9600; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
9601; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm8, %xmm0
9602; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
9603; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9604; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9605; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9606; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
9607; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm0
9608; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
9609; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
9610; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
9611; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
9612; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
9613; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
9614; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm13, %ymm5
9615; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
9616; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
9617; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
9618; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
9619; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9620; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9621; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9622; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9623; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
9624; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm14, %xmm3
9625; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm2
9626; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
9627; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
9628; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
9629; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9630; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
9631; AVX2-FP-NEXT:    # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
9632; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9633; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9634; AVX2-FP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9635; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm5
9636; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
9637; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
9638; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
9639; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
9640; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
9641; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
9642; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm15, %ymm3
9643; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
9644; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
9645; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
9646; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
9647; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9648; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
9649; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
9650; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9651; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9652; AVX2-FP-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,5]
9653; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
9654; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
9655; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
9656; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
9657; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
9658; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
9659; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9660; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm11, %ymm11
9661; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9662; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
9663; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
9664; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
9665; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
9666; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9667; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
9668; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
9669; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
9670; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
9671; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
9672; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
9673; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm13, %ymm4
9674; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9675; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
9676; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
9677; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
9678; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9679; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9680; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm1
9681; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9682; AVX2-FP-NEXT:    # xmm4 = mem[0,1,2,3,7,5,6,5]
9683; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
9684; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9685; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
9686; AVX2-FP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9687; AVX2-FP-NEXT:    # xmm6 = mem[3,1,2,1,4,5,6,7]
9688; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
9689; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9690; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm6
9691; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9692; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
9693; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
9694; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
9695; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
9696; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm1
9697; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
9698; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
9699; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm15, %ymm2
9700; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
9701; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
9702; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
9703; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9704; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
9705; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
9706; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
9707; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9708; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9709; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9710; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9711; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
9712; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
9713; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
9714; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
9715; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm4
9716; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
9717; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9718; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
9719; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
9720; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9721; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9722; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9723; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm11
9724; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
9725; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm11, %xmm1
9726; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm12, %xmm5
9727; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
9728; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9729; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
9730; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
9731; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9732; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9733; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9734; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm13
9735; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
9736; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm13, %xmm1
9737; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm14, %xmm10
9738; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
9739; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9740; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9741; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
9742; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9743; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9744; AVX2-FP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9745; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm15
9746; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
9747; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm15, %xmm10
9748; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
9749; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
9750; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
9751; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
9752; AVX2-FP-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
9753; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
9754; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm11, %xmm11
9755; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
9756; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
9757; AVX2-FP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
9758; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
9759; AVX2-FP-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
9760; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
9761; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
9762; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
9763; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
9764; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9765; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
9766; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm13, %xmm6
9767; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm14, %xmm12
9768; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
9769; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
9770; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
9771; AVX2-FP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
9772; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm15, %xmm12
9773; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
9774; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
9775; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9776; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9777; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
9778; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9779; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rsi)
9780; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9781; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
9782; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9783; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rsi)
9784; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9785; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
9786; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9787; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rdx)
9788; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9789; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
9790; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9791; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rdx)
9792; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9793; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
9794; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9795; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rcx)
9796; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9797; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rcx)
9798; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9799; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rcx)
9800; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9801; AVX2-FP-NEXT:    vmovaps %ymm7, (%rcx)
9802; AVX2-FP-NEXT:    vmovdqa %ymm8, 96(%r8)
9803; AVX2-FP-NEXT:    vmovdqa %ymm9, 32(%r8)
9804; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9805; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%r8)
9806; AVX2-FP-NEXT:    vmovdqa %ymm3, (%r8)
9807; AVX2-FP-NEXT:    vmovdqa %ymm10, 96(%r9)
9808; AVX2-FP-NEXT:    vmovdqa %ymm1, 32(%r9)
9809; AVX2-FP-NEXT:    vmovdqa %ymm5, (%r9)
9810; AVX2-FP-NEXT:    vmovdqa %ymm4, 64(%r9)
9811; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9812; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
9813; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%rax)
9814; AVX2-FP-NEXT:    vmovdqa %ymm2, 64(%rax)
9815; AVX2-FP-NEXT:    vmovdqa %ymm11, (%rax)
9816; AVX2-FP-NEXT:    addq $1304, %rsp # imm = 0x518
9817; AVX2-FP-NEXT:    vzeroupper
9818; AVX2-FP-NEXT:    retq
9819;
9820; AVX2-FCP-LABEL: load_i16_stride6_vf64:
9821; AVX2-FCP:       # %bb.0:
9822; AVX2-FCP-NEXT:    subq $1304, %rsp # imm = 0x518
9823; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
9824; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
9825; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm2
9826; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm3
9827; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm4
9828; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
9829; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
9830; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9831; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm9
9832; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9833; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm6
9834; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
9835; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
9836; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
9837; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
9838; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9839; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
9840; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9841; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
9842; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
9843; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9844; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
9845; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9846; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
9847; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9848; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
9849; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9850; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
9851; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
9852; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
9853; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
9854; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
9855; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7]
9856; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
9857; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
9858; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
9859; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
9860; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
9861; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9862; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm6
9863; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9864; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
9865; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9866; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
9867; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
9868; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
9869; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
9870; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
9871; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
9872; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm8
9873; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
9874; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9875; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
9876; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9877; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
9878; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9879; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
9880; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm8
9881; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm9
9882; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
9883; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
9884; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9885; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
9886; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm13
9887; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
9888; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9889; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm13
9890; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9891; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm12
9892; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9893; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
9894; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm13
9895; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
9896; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7]
9897; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
9898; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9899; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
9900; AVX2-FCP-NEXT:    # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
9901; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
9902; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
9903; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9904; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
9905; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
9906; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
9907; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
9908; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
9909; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm10
9910; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
9911; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9912; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
9913; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
9914; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
9915; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
9916; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
9917; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9918; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
9919; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
9920; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
9921; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm4
9922; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
9923; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9924; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm13, %ymm1
9925; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
9926; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
9927; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
9928; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
9929; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9930; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9931; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
9932; AVX2-FCP-NEXT:    # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9933; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm0
9934; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
9935; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9936; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
9937; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
9938; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
9939; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
9940; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
9941; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
9942; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9943; AVX2-FCP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
9944; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9945; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
9946; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9947; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm0
9948; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9949; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm3
9950; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9951; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
9952; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
9953; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
9954; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm4
9955; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9956; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
9957; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9958; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
9959; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9960; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9961; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9962; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9963; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
9964; AVX2-FCP-NEXT:    # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9965; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm2
9966; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9967; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9968; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm2
9969; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
9970; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
9971; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
9972; AVX2-FCP-NEXT:    # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
9973; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9974; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
9975; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm15
9976; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
9977; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm3
9978; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9979; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm4
9980; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9981; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
9982; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
9983; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm4
9984; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9985; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
9986; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9987; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
9988; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9989; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9990; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9991; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9992; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
9993; AVX2-FCP-NEXT:    # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9994; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm2
9995; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9996; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9997; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm2
9998; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
9999; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
10000; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10001; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
10002; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
10003; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10004; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm3
10005; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
10006; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm3
10007; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10008; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm4
10009; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10010; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
10011; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm9
10012; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm3
10013; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10014; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm10
10015; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
10016; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10017; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15]
10018; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
10019; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10020; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
10021; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10022; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
10023; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10024; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
10025; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm9
10026; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm13
10027; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
10028; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
10029; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10030; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10031; AVX2-FCP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
10032; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm10
10033; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3]
10034; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm12
10035; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
10036; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7]
10037; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10038; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10039; AVX2-FCP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
10040; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm15
10041; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
10042; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10043; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
10044; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
10045; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10046; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
10047; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
10048; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10049; AVX2-FCP-NEXT:    # xmm9 = mem[1,1,1,1,4,5,6,7]
10050; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
10051; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
10052; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10053; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
10054; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
10055; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
10056; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
10057; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
10058; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10059; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
10060; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
10061; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7]
10062; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10063; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10064; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10065; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10066; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
10067; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10068; AVX2-FCP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
10069; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
10070; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10071; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
10072; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
10073; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10074; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10075; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
10076; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
10077; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
10078; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10079; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10080; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10081; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10082; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm0
10083; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10084; AVX2-FCP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
10085; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
10086; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10087; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
10088; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
10089; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10090; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10091; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
10092; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
10093; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
10094; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10095; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10096; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10097; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10098; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm0
10099; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
10100; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
10101; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm1
10102; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
10103; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7]
10104; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
10105; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
10106; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
10107; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10108; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
10109; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10110; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10111; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
10112; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10113; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10114; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10115; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10116; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10117; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10118; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
10119; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
10120; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
10121; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
10122; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10123; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
10124; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
10125; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
10126; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
10127; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10128; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10129; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10130; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
10131; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
10132; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
10133; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
10134; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm1
10135; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
10136; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10137; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10138; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
10139; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
10140; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
10141; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
10142; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
10143; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10144; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10145; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10146; AVX2-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10147; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
10148; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10149; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10150; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10151; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10152; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
10153; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10154; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
10155; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
10156; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10157; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
10158; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
10159; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
10160; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10161; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10162; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10163; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
10164; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
10165; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10166; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
10167; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10168; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
10169; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
10170; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10171; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10172; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
10173; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
10174; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
10175; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
10176; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10177; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10178; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10179; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
10180; AVX2-FCP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10181; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10182; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10183; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10184; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
10185; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
10186; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
10187; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm0
10188; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
10189; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
10190; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10191; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10192; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10193; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
10194; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
10195; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
10196; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
10197; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
10198; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
10199; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10200; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm5
10201; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
10202; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
10203; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
10204; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
10205; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10206; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10207; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10208; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10209; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
10210; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm3
10211; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
10212; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
10213; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
10214; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
10215; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10216; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
10217; AVX2-FCP-NEXT:    # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10218; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10219; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10220; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10221; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
10222; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
10223; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
10224; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
10225; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
10226; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
10227; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10228; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm3
10229; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
10230; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
10231; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
10232; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
10233; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10234; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
10235; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
10236; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
10237; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10238; AVX2-FCP-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,5]
10239; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
10240; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
10241; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
10242; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
10243; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
10244; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
10245; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10246; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm11
10247; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10248; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
10249; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
10250; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
10251; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
10252; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10253; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
10254; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
10255; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
10256; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
10257; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
10258; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
10259; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm13, %ymm4
10260; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10261; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
10262; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
10263; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
10264; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10265; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10266; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm1
10267; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10268; AVX2-FCP-NEXT:    # xmm4 = mem[0,1,2,3,7,5,6,5]
10269; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
10270; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10271; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
10272; AVX2-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10273; AVX2-FCP-NEXT:    # xmm6 = mem[3,1,2,1,4,5,6,7]
10274; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
10275; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10276; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm6
10277; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10278; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
10279; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
10280; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
10281; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
10282; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm1
10283; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
10284; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
10285; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm2
10286; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
10287; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
10288; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
10289; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10290; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
10291; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
10292; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
10293; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10294; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10295; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10296; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10297; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
10298; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
10299; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
10300; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
10301; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm4
10302; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
10303; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10304; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
10305; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
10306; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10307; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10308; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10309; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm11
10310; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
10311; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm1
10312; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm5
10313; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
10314; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10315; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
10316; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
10317; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10318; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10319; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10320; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm13
10321; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
10322; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm1
10323; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm10
10324; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
10325; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10326; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10327; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
10328; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10329; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10330; AVX2-FCP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10331; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm15
10332; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
10333; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm10
10334; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
10335; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
10336; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
10337; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
10338; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
10339; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
10340; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm11
10341; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
10342; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
10343; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
10344; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
10345; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
10346; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
10347; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
10348; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
10349; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10350; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10351; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
10352; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm6
10353; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm12
10354; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
10355; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10356; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10357; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
10358; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm12
10359; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
10360; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
10361; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10362; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10363; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
10364; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10365; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rsi)
10366; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10367; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
10368; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10369; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rsi)
10370; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10371; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
10372; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10373; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rdx)
10374; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10375; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rdx)
10376; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10377; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rdx)
10378; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10379; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rdx)
10380; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10381; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
10382; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10383; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rcx)
10384; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10385; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rcx)
10386; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10387; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rcx)
10388; AVX2-FCP-NEXT:    vmovdqa %ymm8, 96(%r8)
10389; AVX2-FCP-NEXT:    vmovdqa %ymm9, 32(%r8)
10390; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10391; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%r8)
10392; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%r8)
10393; AVX2-FCP-NEXT:    vmovdqa %ymm10, 96(%r9)
10394; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%r9)
10395; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%r9)
10396; AVX2-FCP-NEXT:    vmovdqa %ymm4, 64(%r9)
10397; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10398; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
10399; AVX2-FCP-NEXT:    vmovdqa %ymm6, 32(%rax)
10400; AVX2-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
10401; AVX2-FCP-NEXT:    vmovdqa %ymm11, (%rax)
10402; AVX2-FCP-NEXT:    addq $1304, %rsp # imm = 0x518
10403; AVX2-FCP-NEXT:    vzeroupper
10404; AVX2-FCP-NEXT:    retq
10405;
10406; AVX512-LABEL: load_i16_stride6_vf64:
10407; AVX512:       # %bb.0:
10408; AVX512-NEXT:    subq $1480, %rsp # imm = 0x5C8
10409; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
10410; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm0
10411; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10412; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm1
10413; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10414; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
10415; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm0
10416; AVX512-NEXT:    vextracti32x4 $1, %ymm1, %xmm20
10417; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
10418; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3]
10419; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
10420; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
10421; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm1
10422; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10423; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm2
10424; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10425; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
10426; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm2
10427; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
10428; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
10429; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm21
10430; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
10431; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
10432; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10433; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
10434; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10435; AVX512-NEXT:    vmovdqa 448(%rdi), %ymm0
10436; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm1
10437; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10438; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm2
10439; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10440; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
10441; AVX512-NEXT:    vpshufb %xmm9, %xmm12, %xmm1
10442; AVX512-NEXT:    vextracti32x4 $1, %ymm12, %xmm22
10443; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3]
10444; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
10445; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
10446; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
10447; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10448; AVX512-NEXT:    vinserti128 $1, 480(%rdi), %ymm0, %ymm0
10449; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10450; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
10451; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
10452; AVX512-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
10453; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
10454; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10455; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10456; AVX512-NEXT:    vmovdqa 640(%rdi), %ymm0
10457; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm1
10458; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10459; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm2
10460; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10461; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
10462; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm2
10463; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7]
10464; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
10465; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm28
10466; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
10467; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10468; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
10469; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10470; AVX512-NEXT:    vinserti128 $1, 672(%rdi), %ymm0, %ymm0
10471; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10472; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
10473; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
10474; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
10475; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
10476; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm29
10477; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10478; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
10479; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10480; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10481; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm0
10482; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10483; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm1
10484; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10485; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
10486; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
10487; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3]
10488; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
10489; AVX512-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
10490; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
10491; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm1
10492; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
10493; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10494; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
10495; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm30
10496; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm7
10497; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
10498; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7]
10499; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
10500; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10501; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
10502; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10503; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
10504; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10505; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
10506; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10507; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
10508; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
10509; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm9
10510; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
10511; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
10512; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
10513; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
10514; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
10515; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10516; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
10517; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10518; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
10519; AVX512-NEXT:    vpshufb %ymm5, %ymm8, %ymm2
10520; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
10521; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10522; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm0
10523; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10524; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm1
10525; AVX512-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
10526; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
10527; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
10528; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7]
10529; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7]
10530; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
10531; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm2
10532; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
10533; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10534; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
10535; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
10536; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm31
10537; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
10538; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
10539; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
10540; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
10541; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
10542; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
10543; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10544; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
10545; AVX512-NEXT:    vpshufb %xmm10, %xmm15, %xmm0
10546; AVX512-NEXT:    vpshufb %xmm10, %xmm14, %xmm14
10547; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7]
10548; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
10549; AVX512-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
10550; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
10551; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
10552; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
10553; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
10554; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10555; AVX512-NEXT:    vpshufb %xmm10, %xmm9, %xmm0
10556; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
10557; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
10558; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
10559; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm4
10560; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
10561; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10562; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm0
10563; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
10564; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
10565; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
10566; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
10567; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10568; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
10569; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
10570; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
10571; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm27
10572; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
10573; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
10574; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
10575; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
10576; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
10577; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
10578; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
10579; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
10580; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
10581; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10582; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
10583; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10584; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
10585; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
10586; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm2
10587; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
10588; AVX512-NEXT:    vpshufb %xmm10, %xmm12, %xmm3
10589; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
10590; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10591; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10592; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm0
10593; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
10594; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm1
10595; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
10596; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5]
10597; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
10598; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10599; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10600; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
10601; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10602; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm26
10603; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10604; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10605; AVX512-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10606; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
10607; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
10608; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
10609; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
10610; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm28
10611; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
10612; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7]
10613; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm21
10614; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
10615; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm2
10616; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
10617; AVX512-NEXT:    # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
10618; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
10619; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
10620; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
10621; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm20
10622; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
10623; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
10624; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4]
10625; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm19
10626; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
10627; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10628; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm2
10629; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10630; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10631; AVX512-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10632; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
10633; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
10634; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
10635; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
10636; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
10637; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
10638; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
10639; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm16
10640; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
10641; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10642; AVX512-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
10643; AVX512-NEXT:    # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
10644; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
10645; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
10646; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm17
10647; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
10648; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
10649; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10650; AVX512-NEXT:    vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
10651; AVX512-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
10652; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
10653; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
10654; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
10655; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
10656; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
10657; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
10658; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
10659; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10660; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm4
10661; AVX512-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
10662; AVX512-NEXT:    # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7]
10663; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
10664; AVX512-NEXT:    vpshufb %ymm5, %ymm13, %ymm4
10665; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm24
10666; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
10667; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
10668; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
10669; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
10670; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
10671; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm29 & (zmm1 ^ zmm2))
10672; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm22 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
10673; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm1))
10674; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10675; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10676; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10677; AVX512-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
10678; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
10679; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
10680; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
10681; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
10682; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
10683; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
10684; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10685; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10686; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10687; AVX512-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
10688; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
10689; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
10690; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
10691; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
10692; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
10693; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
10694; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
10695; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10696; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm23
10697; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10698; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10699; AVX512-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
10700; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
10701; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3]
10702; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1]
10703; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7]
10704; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
10705; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7]
10706; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10707; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10708; AVX512-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
10709; AVX512-NEXT:    # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
10710; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
10711; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
10712; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10713; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10714; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10715; AVX512-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10716; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
10717; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
10718; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
10719; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
10720; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
10721; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
10722; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
10723; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10724; AVX512-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10725; AVX512-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10726; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm0
10727; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
10728; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
10729; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
10730; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
10731; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
10732; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
10733; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm23 ^ (zmm29 & (zmm2 ^ zmm23))
10734; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm2))
10735; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10736; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
10737; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
10738; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm2
10739; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
10740; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
10741; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
10742; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm2
10743; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
10744; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm14
10745; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
10746; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
10747; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
10748; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10749; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm28
10750; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm0
10751; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
10752; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
10753; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
10754; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
10755; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
10756; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
10757; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
10758; AVX512-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
10759; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
10760; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
10761; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
10762; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7]
10763; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
10764; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
10765; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
10766; AVX512-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
10767; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10768; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
10769; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
10770; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
10771; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
10772; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10773; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
10774; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10775; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm13
10776; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10777; AVX512-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm13
10778; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10779; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm25
10780; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm28 ^ (zmm29 & (zmm0 ^ zmm28))
10781; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm22 & (zmm25 ^ zmm0))
10782; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
10783; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
10784; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
10785; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
10786; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
10787; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7]
10788; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
10789; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
10790; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10791; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
10792; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7]
10793; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
10794; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
10795; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
10796; AVX512-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
10797; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
10798; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
10799; AVX512-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
10800; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
10801; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
10802; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
10803; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
10804; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10805; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
10806; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
10807; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
10808; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm28
10809; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm29 & (zmm2 ^ zmm0))
10810; AVX512-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm22 & (zmm28 ^ zmm2))
10811; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10812; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10813; AVX512-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10814; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
10815; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm0
10816; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
10817; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7]
10818; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm24
10819; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
10820; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm4
10821; AVX512-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload
10822; AVX512-NEXT:    # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7]
10823; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
10824; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1]
10825; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
10826; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm22
10827; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
10828; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
10829; AVX512-NEXT:    vpshufb %xmm13, %xmm5, %xmm4
10830; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
10831; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
10832; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10833; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm30
10834; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10835; AVX512-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
10836; AVX512-NEXT:    # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
10837; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10838; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10839; AVX512-NEXT:    # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10840; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
10841; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm4
10842; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7]
10843; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm21
10844; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
10845; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
10846; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
10847; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm4
10848; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm26
10849; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm4
10850; AVX512-NEXT:    movw $31, %ax
10851; AVX512-NEXT:    kmovw %eax, %k1
10852; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm30 {%k1}
10853; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm3
10854; AVX512-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
10855; AVX512-NEXT:    # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7]
10856; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10857; AVX512-NEXT:    vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
10858; AVX512-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7]
10859; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm6
10860; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1]
10861; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
10862; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
10863; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
10864; AVX512-NEXT:    vpshufb %xmm13, %xmm6, %xmm4
10865; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm18
10866; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
10867; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10868; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
10869; AVX512-NEXT:    vpshufb %ymm6, %ymm5, %ymm4
10870; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm16
10871; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm19
10872; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
10873; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
10874; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10875; AVX512-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
10876; AVX512-NEXT:    # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7]
10877; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10878; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
10879; AVX512-NEXT:    # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
10880; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm3
10881; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm14
10882; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7]
10883; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
10884; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
10885; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm17
10886; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm29) | ymm0
10887; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10888; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
10889; AVX512-NEXT:    # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10890; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
10891; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm9
10892; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7]
10893; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
10894; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10895; AVX512-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10896; AVX512-NEXT:    # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
10897; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm12
10898; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1]
10899; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7]
10900; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
10901; AVX512-NEXT:    vpshufb %xmm13, %xmm12, %xmm6
10902; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
10903; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10904; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm31
10905; AVX512-NEXT:    vmovdqa32 %zmm4, %zmm31 {%k1}
10906; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10907; AVX512-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
10908; AVX512-NEXT:    # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
10909; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10910; AVX512-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10911; AVX512-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10912; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm7
10913; AVX512-NEXT:    vpshufb %xmm13, %xmm7, %xmm3
10914; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1]
10915; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
10916; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
10917; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
10918; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
10919; AVX512-NEXT:    vpshufb %ymm0, %ymm8, %ymm3
10920; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10921; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7]
10922; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm16
10923; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
10924; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
10925; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3]
10926; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
10927; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
10928; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
10929; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
10930; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
10931; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
10932; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
10933; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
10934; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7]
10935; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10936; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
10937; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
10938; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm21[1,1,2,3]
10939; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
10940; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7]
10941; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
10942; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
10943; AVX512-NEXT:    vpshufb %ymm10, %ymm0, %ymm13
10944; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm13
10945; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
10946; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
10947; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
10948; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm0
10949; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
10950; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
10951; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7]
10952; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
10953; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
10954; AVX512-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
10955; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10956; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
10957; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
10958; AVX512-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
10959; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm10
10960; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3]
10961; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
10962; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
10963; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm29) | ymm3
10964; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
10965; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
10966; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
10967; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
10968; AVX512-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm3
10969; AVX512-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm5
10970; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
10971; AVX512-NEXT:    vpshufb %xmm6, %xmm12, %xmm9
10972; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
10973; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
10974; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7]
10975; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10976; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm9, %zmm1
10977; AVX512-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k1}
10978; AVX512-NEXT:    vpshufb %ymm13, %ymm8, %ymm8
10979; AVX512-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
10980; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
10981; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3]
10982; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7]
10983; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
10984; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
10985; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
10986; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
10987; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10988; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
10989; AVX512-NEXT:    # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem))
10990; AVX512-NEXT:    movw $-2048, %ax # imm = 0xF800
10991; AVX512-NEXT:    kmovw %eax, %k1
10992; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10993; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
10994; AVX512-NEXT:    vmovdqa64 %zmm7, (%rsi)
10995; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10996; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
10997; AVX512-NEXT:    # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem))
10998; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10999; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
11000; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
11001; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11002; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
11003; AVX512-NEXT:    # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem))
11004; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11005; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm7 {%k1}
11006; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
11007; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11008; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
11009; AVX512-NEXT:    # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem))
11010; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11011; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k1}
11012; AVX512-NEXT:    vmovdqa64 %zmm7, (%rdx)
11013; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
11014; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ zmm30))
11015; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm31))
11016; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2))
11017; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm6 & (zmm4 ^ zmm1))
11018; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11019; AVX512-NEXT:    vmovaps %zmm1, 64(%rcx)
11020; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11021; AVX512-NEXT:    vmovaps %zmm1, (%rcx)
11022; AVX512-NEXT:    vmovdqa64 %zmm28, 64(%r8)
11023; AVX512-NEXT:    vmovdqa64 %zmm25, (%r8)
11024; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%r9)
11025; AVX512-NEXT:    vmovdqa64 %zmm3, (%r9)
11026; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11027; AVX512-NEXT:    vmovdqa64 %zmm4, 64(%rax)
11028; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
11029; AVX512-NEXT:    addq $1480, %rsp # imm = 0x5C8
11030; AVX512-NEXT:    vzeroupper
11031; AVX512-NEXT:    retq
11032;
11033; AVX512-FCP-LABEL: load_i16_stride6_vf64:
11034; AVX512-FCP:       # %bb.0:
11035; AVX512-FCP-NEXT:    subq $1416, %rsp # imm = 0x588
11036; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
11037; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm0
11038; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11039; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
11040; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11041; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11042; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
11043; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm1
11044; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
11045; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
11046; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
11047; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
11048; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
11049; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %ymm1
11050; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11051; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
11052; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11053; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11054; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
11055; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm1
11056; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm3
11057; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
11058; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
11059; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
11060; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11061; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
11062; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11063; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm0
11064; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm1
11065; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11066; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm2
11067; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11068; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
11069; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm1
11070; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm2
11071; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
11072; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
11073; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
11074; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
11075; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
11076; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11077; AVX512-FCP-NEXT:    vinserti128 $1, 480(%rdi), %ymm0, %ymm0
11078; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11079; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
11080; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
11081; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
11082; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
11083; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
11084; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11085; AVX512-FCP-NEXT:    vmovdqa 736(%rdi), %ymm0
11086; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11087; AVX512-FCP-NEXT:    vmovdqa 704(%rdi), %ymm1
11088; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11089; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
11090; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm0
11091; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm2
11092; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
11093; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
11094; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
11095; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm1
11096; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11097; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
11098; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11099; AVX512-FCP-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
11100; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11101; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
11102; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
11103; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
11104; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
11105; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm29
11106; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
11107; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
11108; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11109; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11110; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
11111; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11112; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
11113; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11114; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11115; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm0
11116; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm14, %xmm1
11117; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,1,0,3]
11118; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm15, %xmm0
11119; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
11120; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
11121; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11122; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
11123; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11124; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
11125; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
11126; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
11127; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm2
11128; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
11129; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11130; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
11131; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11132; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
11133; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11134; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
11135; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11136; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
11137; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm0
11138; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
11139; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
11140; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
11141; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
11142; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
11143; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
11144; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11145; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
11146; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11147; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
11148; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm2
11149; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
11150; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11151; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
11152; AVX512-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
11153; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
11154; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11155; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
11156; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
11157; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
11158; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm10
11159; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
11160; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
11161; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
11162; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11163; AVX512-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
11164; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
11165; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
11166; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
11167; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
11168; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
11169; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
11170; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
11171; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11172; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11173; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
11174; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
11175; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm14
11176; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7]
11177; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
11178; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
11179; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
11180; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
11181; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
11182; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm0
11183; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11184; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm0
11185; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm3
11186; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7]
11187; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
11188; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm5
11189; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
11190; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11191; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm0
11192; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
11193; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
11194; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
11195; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
11196; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11197; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
11198; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
11199; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11200; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
11201; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
11202; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
11203; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
11204; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
11205; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
11206; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
11207; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
11208; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
11209; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
11210; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11211; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
11212; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11213; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
11214; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
11215; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm2
11216; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
11217; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
11218; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
11219; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
11220; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11221; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
11222; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
11223; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
11224; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
11225; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5]
11226; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
11227; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11228; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
11229; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
11230; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11231; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
11232; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11233; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11234; AVX512-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11235; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
11236; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
11237; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
11238; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
11239; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
11240; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
11241; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7]
11242; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
11243; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
11244; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
11245; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11246; AVX512-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
11247; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
11248; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
11249; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
11250; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm2
11251; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
11252; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
11253; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
11254; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
11255; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
11256; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11257; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm3
11258; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11259; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11260; AVX512-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
11261; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
11262; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
11263; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
11264; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
11265; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm1
11266; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
11267; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
11268; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm18
11269; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
11270; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11271; AVX512-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
11272; AVX512-FCP-NEXT:    # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
11273; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
11274; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm5
11275; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
11276; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
11277; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11278; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11279; AVX512-FCP-NEXT:    vpblendd $219, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
11280; AVX512-FCP-NEXT:    # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
11281; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
11282; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
11283; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm5
11284; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm16
11285; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1]
11286; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4]
11287; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
11288; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
11289; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm6
11290; AVX512-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
11291; AVX512-FCP-NEXT:    # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7]
11292; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
11293; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm6
11294; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
11295; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
11296; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
11297; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
11298; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
11299; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11300; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm29 & (zmm4 ^ zmm3))
11301; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
11302; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm26 & (zmm5 ^ zmm4))
11303; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11304; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11305; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11306; AVX512-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
11307; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
11308; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1]
11309; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
11310; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3]
11311; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7]
11312; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
11313; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11314; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11315; AVX512-FCP-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
11316; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
11317; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3]
11318; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm3
11319; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
11320; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4]
11321; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
11322; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
11323; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm11
11324; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11325; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11326; AVX512-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11327; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
11328; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1]
11329; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
11330; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
11331; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7]
11332; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
11333; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11334; AVX512-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
11335; AVX512-FCP-NEXT:    # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
11336; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
11337; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
11338; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11339; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11340; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11341; AVX512-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11342; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
11343; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
11344; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm0
11345; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
11346; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4]
11347; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
11348; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11349; AVX512-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
11350; AVX512-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
11351; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm0
11352; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
11353; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
11354; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
11355; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
11356; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
11357; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm25
11358; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm11 ^ (zmm29 & (zmm1 ^ zmm11))
11359; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm26 & (zmm25 ^ zmm1))
11360; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
11361; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
11362; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
11363; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
11364; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm1
11365; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
11366; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
11367; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
11368; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm11
11369; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
11370; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm13
11371; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5]
11372; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7]
11373; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
11374; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm11, %zmm24
11375; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
11376; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
11377; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm11
11378; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm19
11379; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm0
11380; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7]
11381; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7]
11382; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
11383; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
11384; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
11385; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7]
11386; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
11387; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm13
11388; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
11389; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5]
11390; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7]
11391; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
11392; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm15
11393; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
11394; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15]
11395; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5]
11396; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
11397; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm23
11398; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm29 & (zmm0 ^ zmm24))
11399; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm26 & (zmm23 ^ zmm0))
11400; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm0
11401; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
11402; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
11403; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11404; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
11405; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11406; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm10
11407; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11408; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm10
11409; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11410; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7]
11411; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7]
11412; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
11413; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
11414; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
11415; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
11416; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm8, %zmm0
11417; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
11418; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
11419; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
11420; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
11421; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
11422; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
11423; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
11424; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
11425; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
11426; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
11427; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
11428; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11429; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
11430; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
11431; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11432; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm28
11433; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm29 & (zmm5 ^ zmm0))
11434; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm26 & (zmm28 ^ zmm5))
11435; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11436; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
11437; AVX512-FCP-NEXT:    # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11438; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
11439; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm0
11440; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
11441; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
11442; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
11443; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
11444; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
11445; AVX512-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11446; AVX512-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
11447; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
11448; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
11449; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
11450; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
11451; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
11452; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
11453; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
11454; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
11455; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11456; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm30
11457; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11458; AVX512-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
11459; AVX512-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
11460; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11461; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11462; AVX512-FCP-NEXT:    # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11463; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
11464; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
11465; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
11466; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
11467; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
11468; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
11469; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
11470; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm17
11471; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm29) | ymm3
11472; AVX512-FCP-NEXT:    movw $31, %ax
11473; AVX512-FCP-NEXT:    kmovw %eax, %k1
11474; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm30 {%k1}
11475; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm2
11476; AVX512-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
11477; AVX512-FCP-NEXT:    # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
11478; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11479; AVX512-FCP-NEXT:    vpblendd $146, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
11480; AVX512-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
11481; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
11482; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1]
11483; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
11484; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm20
11485; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
11486; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
11487; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
11488; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11489; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
11490; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm3
11491; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
11492; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
11493; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
11494; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
11495; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11496; AVX512-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
11497; AVX512-FCP-NEXT:    # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
11498; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11499; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
11500; AVX512-FCP-NEXT:    # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
11501; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm3
11502; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm13
11503; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7]
11504; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
11505; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
11506; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm0
11507; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11508; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
11509; AVX512-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11510; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
11511; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm7
11512; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[2,2,2,2,4,5,6,7]
11513; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
11514; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11515; AVX512-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
11516; AVX512-FCP-NEXT:    # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
11517; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm9
11518; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1]
11519; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm4
11520; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm5
11521; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7]
11522; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
11523; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm31
11524; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm31 {%k1}
11525; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11526; AVX512-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11527; AVX512-FCP-NEXT:    # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
11528; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11529; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11530; AVX512-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
11531; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
11532; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
11533; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm4
11534; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
11535; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
11536; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
11537; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm12
11538; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
11539; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7]
11540; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
11541; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
11542; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
11543; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
11544; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
11545; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
11546; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
11547; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
11548; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm17
11549; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm29) | ymm10
11550; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
11551; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm10
11552; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
11553; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
11554; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
11555; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm0
11556; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
11557; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm12
11558; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
11559; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7]
11560; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11561; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm0
11562; AVX512-FCP-NEXT:    vmovdqa32 %zmm14, %zmm0 {%k1}
11563; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm11
11564; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
11565; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm12
11566; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
11567; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
11568; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
11569; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm14
11570; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm14
11571; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
11572; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7]
11573; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm14
11574; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
11575; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm13, %xmm13
11576; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
11577; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7]
11578; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm29) | ymm14
11579; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
11580; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
11581; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7]
11582; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm4
11583; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm7
11584; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
11585; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
11586; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
11587; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7]
11588; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
11589; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm8, %zmm2
11590; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm2 {%k1}
11591; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
11592; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
11593; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11594; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
11595; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11596; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7]
11597; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
11598; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11599; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11600; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11601; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
11602; AVX512-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
11603; AVX512-FCP-NEXT:    kmovw %eax, %k1
11604; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11605; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm5 {%k1}
11606; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rsi)
11607; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11608; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11609; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
11610; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11611; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm5 {%k1}
11612; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rsi)
11613; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11614; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11615; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
11616; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11617; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm5 {%k1}
11618; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
11619; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11620; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11621; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
11622; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11623; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm5 {%k1}
11624; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
11625; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm26 & (zmm4 ^ zmm30))
11626; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm31))
11627; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm0))
11628; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm26 & (zmm1 ^ zmm2))
11629; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 64(%rcx)
11630; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11631; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rcx)
11632; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 64(%r8)
11633; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
11634; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r9)
11635; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
11636; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11637; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
11638; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
11639; AVX512-FCP-NEXT:    addq $1416, %rsp # imm = 0x588
11640; AVX512-FCP-NEXT:    vzeroupper
11641; AVX512-FCP-NEXT:    retq
11642;
11643; AVX512DQ-LABEL: load_i16_stride6_vf64:
11644; AVX512DQ:       # %bb.0:
11645; AVX512DQ-NEXT:    subq $840, %rsp # imm = 0x348
11646; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
11647; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm0
11648; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11649; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm1
11650; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11651; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11652; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm0
11653; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm1, %xmm24
11654; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm25
11655; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3]
11656; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
11657; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
11658; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm1
11659; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11660; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm2
11661; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11662; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11663; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
11664; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7]
11665; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
11666; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
11667; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
11668; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
11669; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11670; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11671; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
11672; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm1
11673; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm2
11674; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11675; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm3
11676; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11677; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
11678; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm15, %xmm2
11679; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm15, %xmm21
11680; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3]
11681; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
11682; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
11683; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
11684; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11685; AVX512DQ-NEXT:    vinserti128 $1, 480(%rdi), %ymm1, %ymm1
11686; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11687; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
11688; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
11689; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm3, %ymm1
11690; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
11691; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
11692; AVX512DQ-NEXT:    vmovdqa 640(%rdi), %ymm1
11693; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm2
11694; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11695; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm4
11696; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11697; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
11698; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm4
11699; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7]
11700; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
11701; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm19
11702; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
11703; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11704; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3]
11705; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11706; AVX512DQ-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
11707; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11708; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7]
11709; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
11710; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm4, %ymm1
11711; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm27
11712; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm18
11713; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
11714; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
11715; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
11716; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11717; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
11718; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
11719; AVX512DQ-NEXT:    kmovw %eax, %k1
11720; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
11721; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11722; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm0
11723; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11724; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm1
11725; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11726; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11727; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
11728; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
11729; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
11730; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
11731; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
11732; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm1
11733; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11734; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
11735; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11736; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11737; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm11
11738; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
11739; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7]
11740; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11741; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11742; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
11743; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
11744; AVX512DQ-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
11745; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
11746; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11747; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
11748; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
11749; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
11750; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
11751; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
11752; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
11753; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm2
11754; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
11755; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
11756; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
11757; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm31
11758; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm30
11759; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm2
11760; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7]
11761; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
11762; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11763; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm1
11764; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11765; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
11766; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm8
11767; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7]
11768; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7]
11769; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
11770; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm2
11771; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
11772; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
11773; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
11774; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
11775; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm29
11776; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm0
11777; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
11778; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
11779; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
11780; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
11781; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11782; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm17 ^ (zmm16 & (zmm7 ^ zmm17))
11783; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
11784; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11785; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
11786; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
11787; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm13, %xmm13
11788; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
11789; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
11790; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm11, %xmm7
11791; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
11792; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
11793; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
11794; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
11795; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm9, %xmm4
11796; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
11797; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7]
11798; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
11799; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
11800; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
11801; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm8, %xmm3
11802; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
11803; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
11804; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
11805; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
11806; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11807; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
11808; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
11809; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11810; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0))
11811; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
11812; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11813; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
11814; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
11815; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
11816; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11817; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
11818; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm1
11819; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
11820; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
11821; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
11822; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
11823; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11824; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
11825; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
11826; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
11827; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
11828; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
11829; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm15, %xmm4
11830; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7]
11831; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
11832; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm1
11833; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
11834; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm2
11835; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
11836; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
11837; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
11838; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11839; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
11840; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
11841; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
11842; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0))
11843; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
11844; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11845; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11846; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11847; AVX512DQ-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11848; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
11849; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
11850; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
11851; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
11852; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm20
11853; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
11854; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7]
11855; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
11856; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
11857; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11858; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11859; AVX512DQ-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
11860; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
11861; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
11862; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
11863; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm23
11864; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
11865; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
11866; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4]
11867; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm24
11868; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
11869; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11870; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm2
11871; AVX512DQ-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
11872; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11873; AVX512DQ-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11874; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
11875; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
11876; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
11877; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
11878; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm25
11879; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
11880; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
11881; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm16
11882; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
11883; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
11884; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm3
11885; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7]
11886; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
11887; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
11888; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm17
11889; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
11890; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
11891; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11892; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11893; AVX512DQ-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
11894; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
11895; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
11896; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
11897; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
11898; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
11899; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
11900; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
11901; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
11902; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm4
11903; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm5
11904; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7]
11905; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
11906; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm13, %ymm4
11907; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm27
11908; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
11909; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
11910; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
11911; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
11912; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11913; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm21 & (zmm1 ^ zmm2))
11914; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm18 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
11915; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm1))
11916; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11917; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11918; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11919; AVX512DQ-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
11920; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
11921; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
11922; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
11923; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
11924; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
11925; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
11926; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
11927; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11928; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11929; AVX512DQ-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
11930; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
11931; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
11932; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
11933; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
11934; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
11935; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
11936; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
11937; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11938; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm19
11939; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11940; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11941; AVX512DQ-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
11942; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
11943; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3]
11944; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1]
11945; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7]
11946; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
11947; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7]
11948; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
11949; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11950; AVX512DQ-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
11951; AVX512DQ-NEXT:    # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
11952; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
11953; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
11954; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11955; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11956; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11957; AVX512DQ-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11958; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
11959; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
11960; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
11961; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
11962; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
11963; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
11964; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
11965; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11966; AVX512DQ-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11967; AVX512DQ-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
11968; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm0
11969; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
11970; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
11971; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
11972; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
11973; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
11974; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm28
11975; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm19 ^ (zmm21 & (zmm2 ^ zmm19))
11976; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm2))
11977; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
11978; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
11979; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
11980; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
11981; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
11982; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
11983; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
11984; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
11985; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm14
11986; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
11987; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
11988; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
11989; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11990; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm20
11991; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
11992; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
11993; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
11994; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
11995; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
11996; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
11997; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
11998; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm0
11999; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
12000; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
12001; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12002; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
12003; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7]
12004; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
12005; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
12006; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
12007; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
12008; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12009; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
12010; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
12011; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
12012; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm27
12013; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm21 & (zmm0 ^ zmm20))
12014; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm18 & (zmm27 ^ zmm0))
12015; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
12016; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
12017; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
12018; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
12019; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
12020; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7]
12021; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
12022; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
12023; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12024; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
12025; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7]
12026; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
12027; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
12028; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
12029; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
12030; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
12031; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
12032; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
12033; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
12034; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
12035; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
12036; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
12037; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12038; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
12039; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
12040; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
12041; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
12042; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm21 & (zmm2 ^ zmm0))
12043; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm2))
12044; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12045; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
12046; AVX512DQ-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12047; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
12048; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
12049; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm13
12050; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7]
12051; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12052; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12053; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12054; AVX512DQ-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
12055; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm5
12056; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
12057; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
12058; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm19
12059; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
12060; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
12061; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm5, %xmm4
12062; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm18
12063; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
12064; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12065; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm23
12066; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
12067; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm2
12068; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
12069; AVX512DQ-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
12070; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
12071; AVX512DQ-NEXT:    # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12072; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
12073; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm4
12074; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
12075; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm30
12076; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12077; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
12078; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
12079; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm4
12080; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm31
12081; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm22) | ymm4
12082; AVX512DQ-NEXT:    movw $31, %ax
12083; AVX512DQ-NEXT:    kmovw %eax, %k1
12084; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1}
12085; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm2
12086; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm4
12087; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2,3,4,5],ymm2[6],ymm4[7]
12088; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12089; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12090; AVX512DQ-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
12091; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm6
12092; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
12093; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
12094; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
12095; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
12096; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm6, %xmm4
12097; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm16
12098; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
12099; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12100; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
12101; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm4
12102; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm21
12103; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm17
12104; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
12105; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm24
12106; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12107; AVX512DQ-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
12108; AVX512DQ-NEXT:    # ymm14 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
12109; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12110; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
12111; AVX512DQ-NEXT:    # ymm15 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
12112; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm2
12113; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm12
12114; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7]
12115; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
12116; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
12117; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm22) | ymm0
12118; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12119; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
12120; AVX512DQ-NEXT:    # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12121; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
12122; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm10
12123; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[2,2,2,2,4,5,6,7]
12124; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
12125; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12126; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12127; AVX512DQ-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12128; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm8
12129; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1]
12130; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7]
12131; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
12132; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm8, %xmm5
12133; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
12134; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
12135; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm25
12136; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1}
12137; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12138; AVX512DQ-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
12139; AVX512DQ-NEXT:    # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
12140; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12141; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
12142; AVX512DQ-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12143; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
12144; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
12145; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
12146; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7]
12147; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
12148; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7]
12149; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
12150; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
12151; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12152; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7]
12153; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm21
12154; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
12155; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
12156; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3]
12157; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
12158; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
12159; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
12160; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
12161; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
12162; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm13
12163; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
12164; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
12165; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7]
12166; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12167; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm26
12168; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
12169; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[1,1,2,3]
12170; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
12171; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
12172; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
12173; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
12174; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm13
12175; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm22) | ymm13
12176; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1}
12177; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
12178; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm9
12179; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
12180; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
12181; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
12182; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6,7]
12183; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
12184; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm0
12185; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
12186; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
12187; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7]
12188; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
12189; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm14, %ymm9
12190; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm11
12191; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
12192; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
12193; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7]
12194; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = (ymm11 & ymm22) | ymm9
12195; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
12196; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3]
12197; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
12198; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
12199; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm6
12200; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
12201; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
12202; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7]
12203; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
12204; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm6, %zmm1
12205; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1}
12206; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm5, %ymm5
12207; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
12208; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
12209; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
12210; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
12211; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12212; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
12213; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
12214; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12215; AVX512DQ-NEXT:    vmovaps %zmm3, (%rsi)
12216; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12217; AVX512DQ-NEXT:    vmovaps %zmm3, 64(%rsi)
12218; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12219; AVX512DQ-NEXT:    vmovaps %zmm3, 64(%rdx)
12220; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12221; AVX512DQ-NEXT:    vmovaps %zmm3, (%rdx)
12222; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
12223; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm3 & (zmm24 ^ zmm23))
12224; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm3 & (zmm21 ^ zmm25))
12225; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm3 & (zmm0 ^ zmm26))
12226; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm1))
12227; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
12228; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12229; AVX512DQ-NEXT:    vmovaps %zmm1, (%rcx)
12230; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 64(%r8)
12231; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%r8)
12232; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 64(%r9)
12233; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%r9)
12234; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12235; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rax)
12236; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
12237; AVX512DQ-NEXT:    addq $840, %rsp # imm = 0x348
12238; AVX512DQ-NEXT:    vzeroupper
12239; AVX512DQ-NEXT:    retq
12240;
12241; AVX512DQ-FCP-LABEL: load_i16_stride6_vf64:
12242; AVX512DQ-FCP:       # %bb.0:
12243; AVX512DQ-FCP-NEXT:    subq $872, %rsp # imm = 0x368
12244; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
12245; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm0
12246; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12247; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
12248; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12249; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
12250; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
12251; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm1
12252; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
12253; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
12254; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
12255; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
12256; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
12257; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %ymm1
12258; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12259; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
12260; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12261; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
12262; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
12263; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
12264; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
12265; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
12266; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
12267; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
12268; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
12269; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12270; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
12271; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
12272; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm2
12273; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12274; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
12275; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12276; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
12277; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm2
12278; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm3
12279; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3]
12280; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm3
12281; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
12282; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
12283; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
12284; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12285; AVX512DQ-FCP-NEXT:    vinserti128 $1, 480(%rdi), %ymm1, %ymm1
12286; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12287; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
12288; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
12289; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm1
12290; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
12291; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
12292; AVX512DQ-FCP-NEXT:    vmovdqa 736(%rdi), %ymm1
12293; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12294; AVX512DQ-FCP-NEXT:    vmovdqa 704(%rdi), %ymm2
12295; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12296; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
12297; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
12298; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm4
12299; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
12300; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
12301; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
12302; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm2
12303; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12304; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3]
12305; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12306; AVX512DQ-FCP-NEXT:    vinserti128 $1, 672(%rdi), %ymm2, %ymm2
12307; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12308; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
12309; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
12310; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm2
12311; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm26
12312; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
12313; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
12314; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
12315; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12316; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12317; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
12318; AVX512DQ-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
12319; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
12320; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
12321; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12322; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
12323; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12324; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
12325; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12326; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
12327; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm0
12328; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
12329; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
12330; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm0
12331; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
12332; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
12333; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12334; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
12335; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
12336; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
12337; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
12338; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
12339; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm2
12340; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
12341; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12342; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
12343; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
12344; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12345; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
12346; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12347; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
12348; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm0
12349; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm2
12350; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3]
12351; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm0
12352; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
12353; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
12354; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
12355; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12356; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
12357; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
12358; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
12359; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm2
12360; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3,4,5,6,7]
12361; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
12362; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12363; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
12364; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12365; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
12366; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
12367; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm8
12368; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
12369; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
12370; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
12371; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
12372; AVX512DQ-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
12373; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
12374; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
12375; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm30
12376; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
12377; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
12378; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
12379; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
12380; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
12381; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
12382; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm17 ^ (zmm16 & (zmm6 ^ zmm17))
12383; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
12384; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12385; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
12386; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm0
12387; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm13
12388; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2],xmm0[3],xmm13[4,5],xmm0[6,7]
12389; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
12390; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
12391; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
12392; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
12393; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
12394; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm0
12395; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
12396; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm5
12397; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
12398; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
12399; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm9, %ymm6
12400; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
12401; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm8, %xmm3
12402; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
12403; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
12404; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
12405; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
12406; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12407; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
12408; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
12409; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12410; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0))
12411; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
12412; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12413; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
12414; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
12415; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm1
12416; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
12417; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
12418; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
12419; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
12420; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
12421; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
12422; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
12423; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12424; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
12425; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
12426; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
12427; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm2
12428; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm4
12429; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
12430; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
12431; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
12432; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
12433; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
12434; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
12435; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
12436; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
12437; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
12438; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12439; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
12440; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
12441; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12442; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0))
12443; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
12444; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12445; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12446; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12447; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12448; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
12449; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3]
12450; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
12451; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1]
12452; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
12453; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7]
12454; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
12455; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12456; AVX512DQ-FCP-NEXT:    vpblendd $36, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
12457; AVX512DQ-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
12458; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
12459; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
12460; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
12461; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
12462; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
12463; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
12464; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
12465; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
12466; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
12467; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12468; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm3
12469; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12470; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12471; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
12472; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
12473; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
12474; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
12475; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
12476; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm1
12477; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm23
12478; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
12479; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm22
12480; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
12481; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
12482; AVX512DQ-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12483; AVX512DQ-FCP-NEXT:    # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
12484; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
12485; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm5
12486; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
12487; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
12488; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12489; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12490; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12491; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
12492; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
12493; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
12494; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm5
12495; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm25
12496; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1]
12497; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4]
12498; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm18
12499; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
12500; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
12501; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm6
12502; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm7
12503; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7]
12504; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
12505; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm6
12506; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm26
12507; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm17
12508; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
12509; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
12510; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
12511; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
12512; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12513; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm20 & (zmm4 ^ zmm3))
12514; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
12515; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm4))
12516; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12517; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12518; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12519; AVX512DQ-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
12520; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
12521; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
12522; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
12523; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm16
12524; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3]
12525; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7]
12526; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
12527; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12528; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12529; AVX512DQ-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
12530; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
12531; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
12532; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm3
12533; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
12534; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
12535; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
12536; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12537; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm9
12538; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12539; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12540; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12541; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
12542; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1]
12543; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
12544; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
12545; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7]
12546; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
12547; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12548; AVX512DQ-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
12549; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7]
12550; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
12551; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
12552; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12553; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12554; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12555; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12556; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
12557; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
12558; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
12559; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1]
12560; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4]
12561; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7]
12562; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12563; AVX512DQ-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12564; AVX512DQ-FCP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
12565; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
12566; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
12567; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
12568; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
12569; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
12570; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
12571; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm27
12572; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm20 & (zmm1 ^ zmm9))
12573; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm28 & (zmm27 ^ zmm1))
12574; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
12575; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm0
12576; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
12577; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7]
12578; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
12579; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
12580; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm1
12581; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm9
12582; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm1
12583; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5]
12584; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7]
12585; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
12586; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm9, %zmm21
12587; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
12588; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
12589; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm9
12590; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm0
12591; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7]
12592; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7]
12593; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
12594; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
12595; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
12596; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7]
12597; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7]
12598; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm0
12599; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm15
12600; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm0
12601; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5]
12602; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
12603; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
12604; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
12605; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
12606; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
12607; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
12608; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5]
12609; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
12610; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm19
12611; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm21 ^ (zmm20 & (zmm1 ^ zmm21))
12612; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm1))
12613; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
12614; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
12615; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
12616; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7]
12617; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
12618; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
12619; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5]
12620; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7]
12621; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12622; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
12623; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm1
12624; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
12625; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7]
12626; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
12627; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
12628; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
12629; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
12630; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
12631; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
12632; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
12633; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12634; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
12635; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
12636; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12637; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm21
12638; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm20 & (zmm1 ^ zmm0))
12639; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm1))
12640; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12641; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
12642; AVX512DQ-FCP-NEXT:    # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12643; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
12644; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
12645; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
12646; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7]
12647; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12648; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12649; AVX512DQ-FCP-NEXT:    vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
12650; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
12651; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
12652; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
12653; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
12654; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm2
12655; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
12656; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
12657; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm17
12658; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
12659; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12660; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm22
12661; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
12662; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
12663; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
12664; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12665; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
12666; AVX512DQ-FCP-NEXT:    # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12667; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
12668; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm15
12669; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7]
12670; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12671; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
12672; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
12673; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm3
12674; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm20) | ymm3
12675; AVX512DQ-FCP-NEXT:    movw $31, %ax
12676; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
12677; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1}
12678; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
12679; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
12680; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7]
12681; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12682; AVX512DQ-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12683; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
12684; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
12685; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1]
12686; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm0
12687; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
12688; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm5, %xmm3
12689; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm29
12690; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
12691; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12692; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
12693; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm3
12694; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm26
12695; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
12696; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
12697; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm23
12698; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12699; AVX512DQ-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
12700; AVX512DQ-FCP-NEXT:    # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
12701; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12702; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
12703; AVX512DQ-FCP-NEXT:    # ymm5 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12704; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
12705; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm0
12706; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[2,2,2,2,4,5,6,7]
12707; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm25
12708; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
12709; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
12710; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm31
12711; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
12712; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12713; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12714; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12715; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
12716; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm8
12717; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[2,2,2,2,4,5,6,7]
12718; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
12719; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12720; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12721; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12722; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
12723; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[0,3,2,1]
12724; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm4
12725; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm12
12726; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
12727; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
12728; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm24
12729; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1}
12730; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12731; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12732; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
12733; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12734; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
12735; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12736; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
12737; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
12738; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm12
12739; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm13
12740; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7]
12741; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
12742; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm13
12743; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
12744; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7]
12745; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm13
12746; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
12747; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm15, %xmm15
12748; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
12749; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6,7]
12750; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
12751; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm14
12752; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm26
12753; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm14
12754; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm14
12755; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
12756; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7]
12757; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
12758; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
12759; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
12760; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm15
12761; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
12762; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7]
12763; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12764; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm0, %zmm0
12765; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm10, %zmm0, %zmm0 {%k1}
12766; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm10
12767; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
12768; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm14
12769; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm14
12770; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7]
12771; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
12772; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm15
12773; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm15
12774; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
12775; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7]
12776; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
12777; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
12778; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm11
12779; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm15
12780; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm11
12781; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
12782; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
12783; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
12784; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm20) | ymm15
12785; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
12786; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
12787; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7]
12788; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
12789; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
12790; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6,7]
12791; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
12792; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm6, %zmm2
12793; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm5, %zmm0, %zmm2 {%k1}
12794; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
12795; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
12796; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
12797; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
12798; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12799; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
12800; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
12801; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12802; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rsi)
12803; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12804; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rsi)
12805; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12806; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rdx)
12807; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12808; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rdx)
12809; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm22))
12810; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm28 & (zmm13 ^ zmm24))
12811; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm28 & (zmm10 ^ zmm0))
12812; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm28 & (zmm1 ^ zmm2))
12813; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, 64(%rcx)
12814; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12815; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rcx)
12816; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 64(%r8)
12817; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
12818; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 64(%r9)
12819; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
12820; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12821; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
12822; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rax)
12823; AVX512DQ-FCP-NEXT:    addq $872, %rsp # imm = 0x368
12824; AVX512DQ-FCP-NEXT:    vzeroupper
12825; AVX512DQ-FCP-NEXT:    retq
12826;
12827; AVX512BW-LABEL: load_i16_stride6_vf64:
12828; AVX512BW:       # %bb.0:
12829; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12830; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm3
12831; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
12832; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm0
12833; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm5
12834; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm4
12835; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm6
12836; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm10
12837; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
12838; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
12839; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm12
12840; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm9
12841; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm11
12842; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
12843; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
12844; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm15
12845; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
12846; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
12847; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
12848; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm17
12849; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
12850; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
12851; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
12852; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm7, %zmm8
12853; AVX512BW-NEXT:    movl $4192256, %edi # imm = 0x3FF800
12854; AVX512BW-NEXT:    kmovd %edi, %k1
12855; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm8 {%k1}
12856; AVX512BW-NEXT:    movw $-2048, %di # imm = 0xF800
12857; AVX512BW-NEXT:    kmovd %edi, %k2
12858; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm8 {%k2}
12859; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm14
12860; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
12861; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
12862; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm7 {%k1}
12863; AVX512BW-NEXT:    vmovdqa32 %zmm14, %zmm7 {%k2}
12864; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
12865; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
12866; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm17
12867; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm16, %zmm17
12868; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
12869; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
12870; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm19
12871; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm18, %zmm19
12872; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
12873; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15
12874; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm14, %zmm15
12875; AVX512BW-NEXT:    vmovdqu16 %zmm19, %zmm15 {%k1}
12876; AVX512BW-NEXT:    vmovdqa32 %zmm17, %zmm15 {%k2}
12877; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm16
12878; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm18
12879; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm14
12880; AVX512BW-NEXT:    vmovdqu16 %zmm18, %zmm14 {%k1}
12881; AVX512BW-NEXT:    vmovdqa32 %zmm16, %zmm14 {%k2}
12882; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
12883; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
12884; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm19
12885; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
12886; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
12887; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
12888; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm21
12889; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm20, %zmm21
12890; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
12891; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm17
12892; AVX512BW-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
12893; AVX512BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
12894; AVX512BW-NEXT:    kmovd %edi, %k2
12895; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm17 {%k2}
12896; AVX512BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
12897; AVX512BW-NEXT:    kmovd %edi, %k1
12898; AVX512BW-NEXT:    vmovdqu16 %zmm19, %zmm17 {%k1}
12899; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm18
12900; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm20
12901; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm16
12902; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm16 {%k2}
12903; AVX512BW-NEXT:    vmovdqu16 %zmm18, %zmm16 {%k1}
12904; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
12905; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
12906; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm21
12907; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm20, %zmm21
12908; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
12909; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
12910; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm23
12911; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
12912; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
12913; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm19
12914; AVX512BW-NEXT:    vpermt2w %zmm10, %zmm18, %zmm19
12915; AVX512BW-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
12916; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm19 {%k1}
12917; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm20
12918; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
12919; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm18
12920; AVX512BW-NEXT:    vmovdqu16 %zmm22, %zmm18 {%k2}
12921; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm18 {%k1}
12922; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
12923; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm21
12924; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm20, %zmm21
12925; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
12926; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
12927; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm23
12928; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
12929; AVX512BW-NEXT:    movw $31, %di
12930; AVX512BW-NEXT:    kmovd %edi, %k2
12931; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k2}
12932; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
12933; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
12934; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm24
12935; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm21, %zmm24
12936; AVX512BW-NEXT:    vmovdqu16 %zmm24, %zmm23 {%k1}
12937; AVX512BW-NEXT:    vpermi2w %zmm6, %zmm4, %zmm21
12938; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
12939; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm20
12940; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k2}
12941; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm22 {%k1}
12942; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
12943; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm20, %zmm10
12944; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
12945; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
12946; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm13, %zmm1
12947; AVX512BW-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k2}
12948; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
12949; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
12950; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm10, %zmm9
12951; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm1 {%k1}
12952; AVX512BW-NEXT:    vpermt2w %zmm6, %zmm10, %zmm4
12953; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm0
12954; AVX512BW-NEXT:    vpermt2w %zmm3, %zmm20, %zmm2
12955; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k2}
12956; AVX512BW-NEXT:    vmovdqu16 %zmm4, %zmm0 {%k1}
12957; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
12958; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
12959; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
12960; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rdx)
12961; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
12962; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rcx)
12963; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%r8)
12964; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%r8)
12965; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r9)
12966; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r9)
12967; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
12968; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
12969; AVX512BW-NEXT:    vzeroupper
12970; AVX512BW-NEXT:    retq
12971;
12972; AVX512BW-FCP-LABEL: load_i16_stride6_vf64:
12973; AVX512BW-FCP:       # %bb.0:
12974; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12975; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm3
12976; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm2
12977; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm0
12978; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
12979; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm4
12980; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm6
12981; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
12982; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
12983; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
12984; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm12
12985; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm9
12986; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm11
12987; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
12988; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
12989; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
12990; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
12991; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
12992; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
12993; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm17
12994; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
12995; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
12996; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm8
12997; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm7, %zmm8
12998; AVX512BW-FCP-NEXT:    movl $4192256, %edi # imm = 0x3FF800
12999; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
13000; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm8 {%k1}
13001; AVX512BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
13002; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
13003; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm8 {%k2}
13004; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm14
13005; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
13006; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
13007; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm7 {%k1}
13008; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm14, %zmm7 {%k2}
13009; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
13010; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
13011; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm17
13012; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm16, %zmm17
13013; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
13014; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
13015; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm19
13016; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm18, %zmm19
13017; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
13018; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm15
13019; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm14, %zmm15
13020; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm19, %zmm15 {%k1}
13021; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm15 {%k2}
13022; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm16
13023; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm18
13024; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm14
13025; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm18, %zmm14 {%k1}
13026; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm14 {%k2}
13027; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
13028; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
13029; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
13030; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
13031; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
13032; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
13033; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm21
13034; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm20, %zmm21
13035; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
13036; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm17
13037; AVX512BW-FCP-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
13038; AVX512BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
13039; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
13040; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm17 {%k2}
13041; AVX512BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
13042; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
13043; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm19, %zmm17 {%k1}
13044; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm18
13045; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm20
13046; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm16
13047; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm16 {%k2}
13048; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm18, %zmm16 {%k1}
13049; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
13050; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
13051; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm21
13052; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm20, %zmm21
13053; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
13054; AVX512BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
13055; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
13056; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
13057; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
13058; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm19
13059; AVX512BW-FCP-NEXT:    vpermt2w %zmm10, %zmm18, %zmm19
13060; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
13061; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm19 {%k1}
13062; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm20
13063; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
13064; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm18
13065; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm18 {%k2}
13066; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm18 {%k1}
13067; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
13068; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm21
13069; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm20, %zmm21
13070; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
13071; AVX512BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
13072; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
13073; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
13074; AVX512BW-FCP-NEXT:    movw $31, %di
13075; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
13076; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k2}
13077; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
13078; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
13079; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
13080; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm21, %zmm24
13081; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm24, %zmm23 {%k1}
13082; AVX512BW-FCP-NEXT:    vpermi2w %zmm6, %zmm4, %zmm21
13083; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
13084; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm20
13085; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k2}
13086; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm22 {%k1}
13087; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
13088; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm20, %zmm10
13089; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
13090; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
13091; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm13, %zmm1
13092; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k2}
13093; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
13094; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
13095; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm10, %zmm9
13096; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm1 {%k1}
13097; AVX512BW-FCP-NEXT:    vpermt2w %zmm6, %zmm10, %zmm4
13098; AVX512BW-FCP-NEXT:    vpermt2w %zmm5, %zmm13, %zmm0
13099; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm20, %zmm2
13100; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k2}
13101; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm0 {%k1}
13102; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
13103; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
13104; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
13105; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
13106; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
13107; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rcx)
13108; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 64(%r8)
13109; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
13110; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r9)
13111; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
13112; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
13113; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
13114; AVX512BW-FCP-NEXT:    vzeroupper
13115; AVX512BW-FCP-NEXT:    retq
13116;
13117; AVX512DQ-BW-LABEL: load_i16_stride6_vf64:
13118; AVX512DQ-BW:       # %bb.0:
13119; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13120; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm3
13121; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
13122; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm0
13123; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm5
13124; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm4
13125; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm6
13126; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm10
13127; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
13128; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
13129; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm12
13130; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm9
13131; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm11
13132; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
13133; AVX512DQ-BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
13134; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm15
13135; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
13136; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
13137; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
13138; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm17
13139; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
13140; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
13141; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm8
13142; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm7, %zmm8
13143; AVX512DQ-BW-NEXT:    movl $4192256, %edi # imm = 0x3FF800
13144; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
13145; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm8 {%k1}
13146; AVX512DQ-BW-NEXT:    movw $-2048, %di # imm = 0xF800
13147; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
13148; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm8 {%k2}
13149; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm14
13150; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
13151; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
13152; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm7 {%k1}
13153; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm14, %zmm7 {%k2}
13154; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
13155; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
13156; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm17
13157; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm16, %zmm17
13158; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
13159; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
13160; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm19
13161; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm18, %zmm19
13162; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
13163; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm15
13164; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm14, %zmm15
13165; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm19, %zmm15 {%k1}
13166; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm17, %zmm15 {%k2}
13167; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm16
13168; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm5, %zmm18
13169; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm14
13170; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm18, %zmm14 {%k1}
13171; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm16, %zmm14 {%k2}
13172; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
13173; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
13174; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm19
13175; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
13176; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
13177; AVX512DQ-BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
13178; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm21
13179; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm20, %zmm21
13180; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
13181; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm17
13182; AVX512DQ-BW-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
13183; AVX512DQ-BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
13184; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
13185; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm17 {%k2}
13186; AVX512DQ-BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
13187; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
13188; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm19, %zmm17 {%k1}
13189; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm18
13190; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm20
13191; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm16
13192; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm16 {%k2}
13193; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm18, %zmm16 {%k1}
13194; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
13195; AVX512DQ-BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
13196; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm21
13197; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm20, %zmm21
13198; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
13199; AVX512DQ-BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
13200; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm23
13201; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
13202; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
13203; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm19
13204; AVX512DQ-BW-NEXT:    vpermt2w %zmm10, %zmm18, %zmm19
13205; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
13206; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm19 {%k1}
13207; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm6, %zmm20
13208; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
13209; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm18
13210; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm22, %zmm18 {%k2}
13211; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm18 {%k1}
13212; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
13213; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm21
13214; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm20, %zmm21
13215; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
13216; AVX512DQ-BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
13217; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm23
13218; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
13219; AVX512DQ-BW-NEXT:    movw $31, %di
13220; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
13221; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k2}
13222; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
13223; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
13224; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm24
13225; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm21, %zmm24
13226; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm24, %zmm23 {%k1}
13227; AVX512DQ-BW-NEXT:    vpermi2w %zmm6, %zmm4, %zmm21
13228; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
13229; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm20
13230; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k2}
13231; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm22 {%k1}
13232; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
13233; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm20, %zmm10
13234; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
13235; AVX512DQ-BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
13236; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm13, %zmm1
13237; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k2}
13238; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
13239; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
13240; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm10, %zmm9
13241; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm1 {%k1}
13242; AVX512DQ-BW-NEXT:    vpermt2w %zmm6, %zmm10, %zmm4
13243; AVX512DQ-BW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm0
13244; AVX512DQ-BW-NEXT:    vpermt2w %zmm3, %zmm20, %zmm2
13245; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k2}
13246; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm4, %zmm0 {%k1}
13247; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
13248; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
13249; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
13250; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, (%rdx)
13251; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
13252; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, (%rcx)
13253; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 64(%r8)
13254; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, (%r8)
13255; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, 64(%r9)
13256; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, (%r9)
13257; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
13258; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
13259; AVX512DQ-BW-NEXT:    vzeroupper
13260; AVX512DQ-BW-NEXT:    retq
13261;
13262; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf64:
13263; AVX512DQ-BW-FCP:       # %bb.0:
13264; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13265; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm3
13266; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm2
13267; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm0
13268; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
13269; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm4
13270; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm6
13271; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
13272; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
13273; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
13274; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm12
13275; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm9
13276; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm11
13277; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
13278; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
13279; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm15
13280; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm14, %zmm15
13281; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
13282; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
13283; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm17
13284; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm16, %zmm17
13285; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
13286; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm8
13287; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm7, %zmm8
13288; AVX512DQ-BW-FCP-NEXT:    movl $4192256, %edi # imm = 0x3FF800
13289; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
13290; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm8 {%k1}
13291; AVX512DQ-BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
13292; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
13293; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm8 {%k2}
13294; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm14
13295; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm16
13296; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
13297; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm7 {%k1}
13298; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm14, %zmm7 {%k2}
13299; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
13300; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
13301; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm17
13302; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm16, %zmm17
13303; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
13304; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
13305; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm19
13306; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm18, %zmm19
13307; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
13308; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm15
13309; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm14, %zmm15
13310; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm19, %zmm15 {%k1}
13311; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm15 {%k2}
13312; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm16
13313; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm5, %zmm18
13314; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm14
13315; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm18, %zmm14 {%k1}
13316; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm14 {%k2}
13317; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
13318; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
13319; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
13320; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm18, %zmm19
13321; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
13322; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
13323; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm21
13324; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm20, %zmm21
13325; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
13326; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm17
13327; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm10, %zmm16, %zmm17
13328; AVX512DQ-BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
13329; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
13330; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm17 {%k2}
13331; AVX512DQ-BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
13332; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
13333; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm19, %zmm17 {%k1}
13334; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm18
13335; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm20
13336; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm16
13337; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm16 {%k2}
13338; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm18, %zmm16 {%k1}
13339; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
13340; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
13341; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm21
13342; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm20, %zmm21
13343; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
13344; AVX512DQ-BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
13345; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
13346; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
13347; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
13348; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm19
13349; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm10, %zmm18, %zmm19
13350; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
13351; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm19 {%k1}
13352; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm6, %zmm20
13353; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
13354; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm18
13355; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm18 {%k2}
13356; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm18 {%k1}
13357; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
13358; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm21
13359; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm20, %zmm21
13360; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
13361; AVX512DQ-BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
13362; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23
13363; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm22, %zmm23
13364; AVX512DQ-BW-FCP-NEXT:    movw $31, %di
13365; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
13366; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm23 {%k2}
13367; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
13368; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
13369; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
13370; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm21, %zmm24
13371; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm24, %zmm23 {%k1}
13372; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm6, %zmm4, %zmm21
13373; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm0, %zmm22
13374; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm20
13375; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm22 {%k2}
13376; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm22 {%k1}
13377; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
13378; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm20, %zmm10
13379; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
13380; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
13381; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm13, %zmm1
13382; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k2}
13383; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
13384; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
13385; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm10, %zmm9
13386; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm1 {%k1}
13387; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm6, %zmm10, %zmm4
13388; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm5, %zmm13, %zmm0
13389; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm20, %zmm2
13390; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k2}
13391; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm0 {%k1}
13392; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
13393; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rsi)
13394; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
13395; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
13396; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
13397; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rcx)
13398; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 64(%r8)
13399; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
13400; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 64(%r9)
13401; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
13402; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
13403; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
13404; AVX512DQ-BW-FCP-NEXT:    vzeroupper
13405; AVX512DQ-BW-FCP-NEXT:    retq
13406  %wide.vec = load <384 x i16>, ptr %in.vec, align 64
13407  %strided.vec0 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
13408  %strided.vec1 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
13409  %strided.vec2 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
13410  %strided.vec3 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
13411  %strided.vec4 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
13412  %strided.vec5 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
13413  store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
13414  store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
13415  store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
13416  store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
13417  store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
13418  store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
13419  ret void
13420}
13421