; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
%strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
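; For orientation, a minimal sketch of the IR shape this block checks, with
; assumed function and value names (only the shufflevector line above is from
; the excerpt): a <12 x i16> wide load carries two elements of each of six
; interleaved fields, and field 0 is extracted with a stride-6 mask.
;
; define void @sketch_stride6_vf2(ptr %in.vec, ptr %out.vec0) {
;   %wide.vec = load <12 x i16>, ptr %in.vec, align 64
;   %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
;   store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
;   ret void
; }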
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
%strided.vec0 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
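; Sketch of the vf4 variant (assumed names; only %strided.vec0 above is from
; the excerpt). The six stride-6 deinterleave masks over a <24 x i16> load are
; exactly the constants the AVX512BW checks materialize with vmovq above,
; e.g. [0,6,12,18] for field 0 and [1,7,13,19] for field 1:
;
; %wide.vec = load <24 x i16>, ptr %in.vec, align 64
; %strided.vec0 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
; %strided.vec1 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>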
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,2,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[3,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,0,2,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,0,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm5[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[1,0,2,3,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,5,4]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm12[0]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,2]
; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5,4,6]
; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7]
; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm11[5,6,7]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4],xmm9[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,4,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
%strided.vec0 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
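; Sketch of the vf8 variant (assumed names; only %strided.vec0 above is from
; the excerpt). Each result is a stride-6 mask over a <48 x i16> load; the
; AVX512BW-family checks above sign-extend the byte constant
; [0,6,12,18,24,30,36,42] into a word-permute index vector (the vpmovsxbw
; lines) rather than building the result out of blends and shuffles:
;
; %wide.vec = load <48 x i16>, ptr %in.vec, align 64
; %strided.vec0 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>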
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm3[2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,2,3,4,5,6,7]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,5,4]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,4,6]
; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,0,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6]
; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,5,7]
; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7]
; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5,4,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[2,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
2100 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2102 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2103 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2104 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2105 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
2106 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
2107 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2109 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2110 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
2112 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2114 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2115 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
2116 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
2117 ; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2118 ; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
2119 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2120 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
2121 ; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2122 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2123 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2125 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
2126 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2128 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
2129 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
2130 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2133 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2134 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2]
2135 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2136 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2137 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
2140 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2141 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2142 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2143 ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
2144 ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
2146 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
2149 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2150 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
2154 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
2155 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2156 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7]
2157 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6]
2158 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
2160 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
2162 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
2163 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
2164 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
2166 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
2172 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2178 ; AVX2-FP: # %bb.0:
2185 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2186 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2190 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
2192 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
2193 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2196 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
2198 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
2200 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
2201 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2203 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
2204 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
2205 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
2206 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2207 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
2209 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2214 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
2215 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
2217 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
2218 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2219 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2220 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2222 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
2223 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
2224 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
2225 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2226 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
2228 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
2230 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2232 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
2233 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2234 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2235 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2236 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
2237 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
2238 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
2239 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
2240 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2241 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
2243 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
2247 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
2249 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
2250 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
2251 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
2252 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
2253 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
2256 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2257 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
2258 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2259 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
2260 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
2262 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
2266 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
2267 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2272 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2274 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
2276 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
2277 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2280 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
2282 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2283 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2289 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2295 ; AVX2-FCP: # %bb.0:
2302 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2303 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2307 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
2309 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
2310 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2313 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
2315 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
2317 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
2318 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2320 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
2321 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
2322 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
2323 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2324 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
2326 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2331 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
2332 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
2334 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
2335 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2336 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2337 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2339 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
2340 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
2341 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
2342 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2343 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
2345 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
2347 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2349 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
2350 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2351 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2352 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2353 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
2354 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
2355 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
2356 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
2357 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2358 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
2360 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
2364 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
2366 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
2367 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
2368 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
2369 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
2370 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
2373 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2374 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
2375 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2376 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
2377 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
2379 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
2383 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
2384 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2389 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2391 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
2393 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
2394 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2397 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
2399 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2400 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2406 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2412 ; AVX512: # %bb.0:
2418 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2421 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7]
2422 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
2426 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2428 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
2429 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
2431 ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3]
2432 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
2433 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7]
2434 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7]
2435 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
2436 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
2440 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7]
2442 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2443 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
2445 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
2447 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15]
2448 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2449 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
2451 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2452 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2453 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2454 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
2455 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
2456 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2458 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2459 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
2461 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2463 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2464 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
2465 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
2466 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2467 ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2468 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2469 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2470 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2471 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2473 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
2474 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2476 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
2477 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
2478 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2481 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2482 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2483 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2484 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2485 ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2486 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
2489 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2490 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2491 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2493 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
2495 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
2496 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2497 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7]
2498 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
2499 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
2501 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7]
2504 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2505 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
2509 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2510 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
2511 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7]
2513 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2519 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2525 ; AVX512-FCP: # %bb.0:
2526 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2531 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
2534 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
2536 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
2539 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2541 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
2543 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
2544 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2548 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
2550 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
2551 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
2555 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7]
2557 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7]
2558 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5]
2560 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2562 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2563 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2564 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
2566 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2567 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2568 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2569 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2570 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2572 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2573 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
2575 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2577 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2578 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2579 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2580 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2581 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2582 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2583 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2584 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2586 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2589 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2592 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2593 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2594 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2595 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2596 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2597 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2600 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2601 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2602 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2604 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
2605 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2608 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2610 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
2612 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
2616 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
2619 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2622 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
2624 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2630 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2636 ; AVX512DQ: # %bb.0:
2642 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2645 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7]
2646 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
2650 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2652 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
2653 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
2655 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3]
2656 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
2657 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7]
2658 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7]
2659 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
2660 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
2664 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7]
2666 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
2667 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
2669 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
2671 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15]
2672 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2673 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
2675 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2676 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2677 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2678 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
2679 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
2680 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2682 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2683 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
2685 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2687 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2688 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
2689 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
2690 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2691 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2692 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2693 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2694 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2695 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2697 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
2698 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2700 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
2701 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
2702 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2705 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2706 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2707 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2708 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2709 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2710 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
2713 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2714 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2715 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2717 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
2719 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
2720 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2721 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7]
2722 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
2723 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
2725 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7]
2728 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2729 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
2733 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2734 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
2735 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7]
2737 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2743 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2749 ; AVX512DQ-FCP: # %bb.0:
2750 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
2755 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
2758 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
2760 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
2763 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
2765 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
2767 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
2768 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
2772 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
2774 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
2775 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
2779 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7]
2781 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7]
2782 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5]
2784 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
2786 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
2787 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
2788 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
2790 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2791 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
2792 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
2793 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
2794 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
2796 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
2797 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
2799 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
2801 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
2802 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
2803 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
2804 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
2805 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
2806 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
2807 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2808 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
2810 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
2813 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
2816 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
2817 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
2818 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2819 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
2820 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2821 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2824 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
2825 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
2826 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2828 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
2829 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
2832 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2834 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
2836 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
2840 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
2843 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
2846 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
2848 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
2854 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2860 ; AVX512BW: # %bb.0:
2861 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2862 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2863 ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1]
2868 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
2870 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
2871 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2872 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2873 ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
2875 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
2877 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
2878 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2879 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2880 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
2884 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
2886 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
2887 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
2888 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
2889 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
2891 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
2893 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
2894 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
2895 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
2896 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
2898 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
2900 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
2901 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
2902 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
2904 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
2906 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
2917 ; AVX512BW-FCP: # %bb.0:
2918 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2919 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2920 ; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
2925 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
2927 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
2928 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2929 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2930 ; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
2932 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
2934 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
2935 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2936 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2937 ; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
2941 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
2943 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
2944 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
2945 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
2946 ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
2948 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
2950 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
2951 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
2952 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
2953 ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
2955 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
2957 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
2958 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
2959 ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
2961 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
2963 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
2974 ; AVX512DQ-BW: # %bb.0:
2975 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2976 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
2977 ; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1]
2982 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
2984 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
2985 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2986 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
2987 ; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1]
2989 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
2991 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
2992 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2993 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
2994 ; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1]
2998 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
3000 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
3001 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
3002 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
3003 ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1]
3005 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
3007 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
3008 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
3009 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
3010 ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
3012 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
3014 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
3015 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
3016 ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1]
3018 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
3020 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
3031 ; AVX512DQ-BW-FCP: # %bb.0:
3032 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3033 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
3034 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
3039 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
3041 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
3042 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3043 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
3044 ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
3046 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
3048 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
3049 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3050 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
3051 ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
3055 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
3057 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
3058 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
3059 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
3060 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3062 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
3064 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
3065 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
3066 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
3067 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3069 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
3071 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
3072 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
3073 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3075 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
3077 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7]
3087 %strided.vec0 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
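; NOTE: a minimal, hypothetical sketch of the IR pattern the checks above
; exercise (the function name, pointer argument, and load alignment are
; assumptions for illustration, not taken from the test): load 96 contiguous
; i16 elements, then deinterleave the stride-6 sequence starting at offset 0
; with the shufflevector mask shown on the matched line above.
define <16 x i16> @sketch_load_i16_stride6_vec0(ptr %in) {
  %wide.vec = load <96 x i16>, ptr %in, align 2
  %strided.vec0 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
  ret <16 x i16> %strided.vec0
}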
3104 ; SSE: # %bb.0:
3105 ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8
3107 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3109 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3113 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3115 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3117 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3119 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3120 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7]
3121 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
3124 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3]
3125 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3126 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3133 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3134 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3136 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
3137 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3138 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3139 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
3140 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
3141 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3144 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3145 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,3,2,3]
3146 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7]
3148 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
3149 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3150 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,2,4,5,6,7]
3155 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3156 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3157 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3158 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3166 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3168 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
3169 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3171 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3172 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
3173 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
3174 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3177 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3179 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3180 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3181 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7]
3183 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
3184 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3186 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3187 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
3191 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3193 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3194 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3195 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3196 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3204 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
3206 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0]
3209 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3210 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0]
3211 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm7[2,3]
3212 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3215 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
3217 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3218 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3]
3219 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,0,2,4,5,6,7]
3221 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
3222 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3224 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3225 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
3230 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3231 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3]
3232 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7]
3234 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3240 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
3242 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0]
3243 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3245 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3246 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm9[0,0]
3247 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[2,3]
3248 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3251 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
3253 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3254 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3]
3255 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7]
3257 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0]
3258 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3259 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3262 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3263 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
3266 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3271 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
3272 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3274 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
3275 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3276 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3279 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3280 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
3284 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3288 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
3290 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
3291 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3292 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3294 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3295 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
3298 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3302 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7]
3304 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0]
3305 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3307 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7]
3310 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3313 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7]
3315 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,0]
3316 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3318 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3320 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
3321 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535]
3324 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3325 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[0,0]
3326 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,3]
3327 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7]
3328 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3329 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,2,3,4,5,6,7]
3332 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3333 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3334 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
3335 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3336 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3337 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
3338 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4]
3339 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
3344 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3347 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3349 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
3352 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3353 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[0,0]
3354 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3]
3355 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7]
3356 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
3357 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
3360 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3361 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3362 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
3363 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3364 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
3365 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
3366 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
3371 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3374 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3376 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
3379 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3380 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3381 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0]
3382 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3]
3383 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
3384 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
3385 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
3388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3390 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
3391 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3392 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
3393 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
3394 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
3401 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3403 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm6[0]
3406 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3407 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3408 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
3409 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
3410 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
3411 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
3412 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
3415 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3416 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3417 ; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,2]
3418 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,6,6,7]
3419 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
3420 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
3425 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3429 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3431 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0]
3435 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
3436 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
3439 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3440 ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7]
3441 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
3446 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3449 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3451 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
3455 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3456 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
3459 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3460 ; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7]
3461 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
3466 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3471 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
3475 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3476 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
3479 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3480 ; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7]
3481 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
3486 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3489 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3491 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
3493 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
3494 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
3498 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7]
3499 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
3504 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3505 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3507 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3509 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3510 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3511 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,4,6]
3513 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
3514 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3515 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7]
3516 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3517 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
3522 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3524 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3526 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3527 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3]
3528 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3529 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6]
3530 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3532 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
3533 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3534 ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7]
3535 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3536 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
3541 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3543 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3545 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
3546 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3547 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
3548 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3549 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6]
3551 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
3552 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3553 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
3554 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
3555 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm4[0,1,2,3,4,5,4,6]
3560 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
3562 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3564 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
3565 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,1,0,3]
3566 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,4,6]
3568 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
3569 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3570 ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7]
3571 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3572 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6]
3580 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
3581 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3583 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7]
3585 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
3587 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
3588 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
3594 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3596 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
3599 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3600 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7]
3602 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3603 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3605 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
3606 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7]
3611 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3613 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3615 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
3617 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3619 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3620 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7]
3622 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
3623 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3625 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3626 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
3631 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3633 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3635 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
3637 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3639 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7]
3641 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
3643 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3645 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3646 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
3649 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3651 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3653 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3655 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3657 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3659 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3661 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3663 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3665 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3669 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3671 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3673 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3675 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3677 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3679 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3685 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3690 ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8
3694 ; AVX: # %bb.0:
3695 ; AVX-NEXT: subq $552, %rsp # imm = 0x228
3697 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3698 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3]
3699 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7]
3701 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3704 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3708 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3709 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3712 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3715 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3717 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3719 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,3]
3720 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7]
3721 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3722 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3]
3723 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7]
3725 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
3726 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3728 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3731 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3733 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3737 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3738 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3]
3739 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
3740 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3741 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
3742 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3747 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3752 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3754 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3756 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3757 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3]
3758 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7]
3760 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3764 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3767 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3768 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
3769 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7]
3770 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3772 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3773 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,3]
3774 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7]
3776 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3778 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7]
3779 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3,4,5,6,7]
3781 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3784 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3786 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
3788 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3791 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3792 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3793 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7]
3794 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3795 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
3801 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3802 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7]
3803 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3807 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3808 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
3810 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
3811 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3813 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
3814 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,5,7,6,7]
3815 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3818 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5],xmm7[6,7]
3819 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3820 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7]
3823 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
3824 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3826 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3827 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
3828 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm6[6,7]
3834 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3835 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7]
3836 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3839 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3842 ; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
3844 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
3845 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3847 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3848 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7]
3849 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3852 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
3853 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3854 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
3855 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3857 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3858 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3860 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3861 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3862 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
3867 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3868 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3871 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3872 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
3873 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload
3874 ; AVX-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7]
3875 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3877 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
3878 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
3879 ; AVX-NEXT: # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7]
3880 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3882 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
3883 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload
3884 ; AVX-NEXT: # xmm10 = mem[0,1,2,3],xmm12[4,5],mem[6,7]
3887 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
3891 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3894 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3896 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0]
3897 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm8[4,5],xmm14[6,7]
3899 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
3900 ; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3905 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3906 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3909 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0]
3910 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload
3911 ; AVX-NEXT: # xmm6 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7]
3912 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3914 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
3915 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload
3916 ; AVX-NEXT: # xmm7 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
3919 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload
3920 ; AVX-NEXT: # xmm8 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
3926 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3928 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3930 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
3931 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload
3932 ; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
3934 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3,4],xmm2[5,6,7]
3935 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3940 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3942 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3944 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm11[0],xmm2[0]
3946 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3948 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4],xmm9[5,6,7]
3949 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3951 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
3957 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3960 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm9[0],xmm4[0]
3962 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
3967 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3969 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3971 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
3972 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3974 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
3983 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
3985 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
3990 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3991 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3993 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3995 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3996 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3997 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3998 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
3999 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4000 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
4003 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4005 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4007 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
4008 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4009 ; AVX-NEXT: # xmm0 = mem[0,1,0,3]
4010 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4011 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,6]
4012 ; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4014 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3,4,5,6,7]
4015 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4019 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4020 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
4021 ; AVX-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
4022 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4023 ; AVX-NEXT: # xmm7 = mem[0,1,0,3]
4024 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,6]
4027 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
4029 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7]
4030 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4031 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4033 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4035 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
4037 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4038 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
4039 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4042 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4044 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4046 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
4047 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3]
4048 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,4,6]
4049 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4051 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3,4,5,6,7]
4055 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4056 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
4057 ; AVX-NEXT: # xmm11 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
4059 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4060 ; AVX-NEXT: # xmm9 = mem[0,1,0,3]
4061 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,6]
4062 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4064 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7]
4066 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7]
4067 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4070 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
4072 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4077 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
4078 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4080 ; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4081 ; AVX-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7]
4083 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7]
4087 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4089 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7]
4092 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
4094 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
4095 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4097 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4099 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4100 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4105 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4107 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7]
4109 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
4115 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,7]
4117 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
4119 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
4120 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4122 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4124 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4126 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4128 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4130 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4132 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4134 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4137 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4139 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4142 ; AVX-NEXT: addq $552, %rsp # imm = 0x228
4147 ; AVX2: # %bb.0:
4148 ; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8
4150 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4160 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
4162 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4163 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4164 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4165 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
4166 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
4168 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
4169 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
4173 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7]
4174 ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
4176 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4177 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4179 ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4180 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
4184 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7]
4185 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4186 ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4187 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
4190 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4194 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
4195 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
4199 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4202 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7]
4204 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
4205 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
4206 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
4208 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4210 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3]
4211 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
4213 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
4215 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
4217 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4220 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7]
4221 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4222 ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4225 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7]
4226 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
4229 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
4230 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4231 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4233 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4235 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
4238 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7]
4240 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4242 ; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
4245 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3]
4246 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
4247 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
4248 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4249 ; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
4250 ; AVX2-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7]
4252 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
4254 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
4255 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4256 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4257 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3]
4260 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
4261 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7]
4264 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
4265 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
4266 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7]
4269 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4270 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7]
4272 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4273 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4274 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4276 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5]
4277 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
4279 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3]
4282 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
4283 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6],xmm3[7]
4284 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
4285 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
4286 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
4288 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4289 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4290 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4291 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4292 ; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
4293 ; AVX2-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4294 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
4296 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1]
4297 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
4298 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7]
4299 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
4300 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4]
4301 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
4302 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4303 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4304 ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
4307 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
4308 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7]
4309 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
4310 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7]
4311 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
4313 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
4315 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
4316 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
4317 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
4318 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4319 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4320 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
4321 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
4323 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
4324 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
4325 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
4326 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
4327 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4]
4328 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
4329 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4330 ; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4331 ; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4332 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4334 ; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
4337 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
4338 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7]
4339 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
4340 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7]
4341 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7]
4344 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
4345 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4]
4346 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7]
4347 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4348 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
4350 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
4351 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7]
4353 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
4354 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
4355 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7]
4359 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
4360 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
4361 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7]
4362 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
4363 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
4365 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
4366 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
4369 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
4370 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4371 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
4373 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4374 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
4375 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
4376 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4377 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
4379 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4380 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7]
4381 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
4382 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
4384 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
4386 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4387 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
4388 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4389 ; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
4390 ; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
4393 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
4394 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7]
4395 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
4396 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7]
4398 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4399 ; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
4400 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
4402 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4403 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
4404 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
4406 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4407 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4409 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
4410 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
4411 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
4413 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4414 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
4415 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4417 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4419 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4421 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4423 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4430 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4433 ; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8
4438 ; AVX2-FP: # %bb.0:
4439 ; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8
4441 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4443 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4451 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
4453 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4454 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4456 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
4457 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
4459 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
4460 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
4464 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
4465 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
4467 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4468 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4469 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4470 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
4474 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
4475 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4476 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
4479 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4483 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
4487 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4491 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
4492 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
4493 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
4495 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4498 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
4500 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
4502 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
4504 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4506 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4508 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4509 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
4510 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
4514 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
4516 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
4517 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
4518 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4520 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4522 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
4526 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
4527 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4528 ; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4529 ; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
4531 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
4534 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
4535 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4537 ; AVX2-FP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
4539 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
4541 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
4542 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4543 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4547 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
4550 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
4552 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
4555 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4556 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
4558 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
4559 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
4560 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4562 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4563 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
4567 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
4569 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
4571 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4572 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4573 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4574 ; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
4575 ; AVX2-FP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
4576 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4577 ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4578 ; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
4579 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
4581 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
4582 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
4584 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
4585 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
4586 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4587 ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4588 ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
4590 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
4592 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
4594 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
4595 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
4597 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
4599 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
4600 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
4601 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
4602 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
4603 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4604 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4605 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
4606 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
4609 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
4610 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
4611 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
4612 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4614 ; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4615 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4616 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4617 ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
4619 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
4622 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
4623 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
4626 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
4627 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
4628 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
4629 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
4632 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
4633 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
4634 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
4637 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
4641 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
4642 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
4643 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
4644 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4646 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
4647 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
4651 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
4653 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4654 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
4655 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
4656 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4657 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
4659 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4660 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
4663 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
4665 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
4666 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
4667 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4668 ; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
4669 ; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
4671 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
4674 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
4676 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4677 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
4678 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
4681 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
4683 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4684 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4687 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
4689 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4690 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
4691 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4693 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4695 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4697 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4699 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4706 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4709 ; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8
4714 ; AVX2-FCP: # %bb.0:
4715 ; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8
4717 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4719 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4727 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
4729 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4730 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
4732 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
4733 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
4735 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
4736 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
4740 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
4741 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
4743 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4744 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4745 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4746 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
4750 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
4751 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4752 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
4755 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4759 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
4763 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4767 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
4768 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
4769 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
4771 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4774 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
4776 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
4778 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
4780 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
4782 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4784 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4785 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
4786 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
4790 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
4792 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
4793 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
4794 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4796 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4798 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
4802 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
4803 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4804 ; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
4805 ; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
4807 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
4810 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
4811 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4813 ; AVX2-FCP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
4815 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
4817 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
4818 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4819 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4823 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
4826 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
4828 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
4831 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4832 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
4834 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
4835 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
4836 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4838 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4839 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
4843 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
4845 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
4847 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4848 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4849 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4850 ; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
4851 ; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
4852 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4853 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4854 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
4855 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
4857 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
4858 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
4860 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
4861 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
4862 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4863 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4864 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
4866 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
4868 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
4870 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
4871 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
4873 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
4875 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
4876 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
4877 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
4878 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
4879 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4880 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4881 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
4882 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
4885 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
4886 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
4887 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
4888 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4890 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
4891 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4892 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4893 ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
4895 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
4898 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
4899 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
4902 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
4903 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
4904 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
4905 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
4908 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
4909 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
4910 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
4913 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
4917 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
4918 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
4919 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
4920 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4922 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
4923 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
4927 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
4929 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
4930 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
4931 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
4932 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4933 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
4935 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4936 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
4939 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
4941 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
4942 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
4943 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4944 ; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
4945 ; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
4947 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
4950 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
4952 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4953 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
4954 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
4957 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
4959 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4960 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4963 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
4965 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4966 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
4967 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4969 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4971 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4973 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4975 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4982 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4985 ; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8
4990 ; AVX512: # %bb.0:
4992 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
4995 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7]
4998 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3]
4999 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
5000 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
5006 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
5011 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7]
5012 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
5016 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5018 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
5022 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3]
5023 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
5024 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7]
5026 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7]
5030 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
5034 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
5039 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7]
5045 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7]
5048 ; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5049 ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
5050 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
5051 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
5056 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
5059 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5060 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7]
5063 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5066 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
5068 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7]
5070 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5]
5071 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5074 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5075 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5076 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5078 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
5083 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1]
5084 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7]
5085 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
5086 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7]
5087 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5090 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5092 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
5093 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
5094 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
5095 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1]
5096 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4]
5097 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
5102 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
5106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1]
5107 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7]
5108 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
5109 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
5110 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5113 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7]
5114 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5115 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
5116 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5121 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
5123 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
5124 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
5125 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
5126 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
5127 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
5128 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
5132 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
5133 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5134 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
5135 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5136 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
5138 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5140 ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5143 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7]
5144 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
5145 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7]
5146 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
5148 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
5149 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
5155 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
5156 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
5157 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
5159 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
5160 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
5161 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
5163 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
5164 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
5167 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
5168 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
5169 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
5173 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
5176 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
5180 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
5183 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
5185 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1]
5186 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
5187 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
5188 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
5190 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7]
5193 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
5196 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
5200 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7]
5201 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5202 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
5209 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7]
5212 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1]
5213 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7]
5214 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
5215 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7]
5219 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
5220 ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
5221 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7]
5226 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5227 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
5228 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
5230 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
5231 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
5232 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
5237 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
5238 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
5243 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7]
5244 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
5245 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7]
5248 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
5250 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5252 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
5254 ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800
5258 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload
5262 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
5274 ; AVX512-FCP: # %bb.0:
5276 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
5279 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7]
5282 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3]
5284 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
5290 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7]
5292 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
5296 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
5299 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5300 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7]
5305 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3]
5307 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7]
5310 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7]
5314 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7]
5315 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5318 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
5324 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7]
5329 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7]
5332 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5333 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
5334 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
5335 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
5340 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
5343 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5344 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
5347 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5350 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
5352 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5355 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5356 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5359 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5360 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5361 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5363 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
5368 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7]
5369 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1]
5370 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
5371 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
5373 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
5376 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
5377 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
5379 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
5380 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
5381 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
5387 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5390 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
5391 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
5392 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
5393 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
5396 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7]
5397 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5398 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
5399 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5403 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7]
5405 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
5407 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1]
5408 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4]
5409 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
5413 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
5414 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5415 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
5416 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5417 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
5419 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5421 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5425 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7]
5428 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
5429 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7]
5434 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
5438 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
5439 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5441 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
5442 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
5445 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
5446 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
5447 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5451 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
5454 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
5458 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
5461 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
5463 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
5464 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
5467 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
5470 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
5473 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
5477 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7]
5478 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5479 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
5485 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7]
5488 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
5490 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7]
5494 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
5495 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
5496 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
5501 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7]
5506 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
5507 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
5510 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
5516 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
5519 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
5521 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5522 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5523 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
5525 ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
5530 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
5534 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5546 ; AVX512DQ: # %bb.0:
5548 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
5551 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
5555 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
5556 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
5557 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
5563 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
5568 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7]
5569 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
5574 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
5579 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3]
5580 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
5581 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7]
5583 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
5586 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7]
5589 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
5594 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
5600 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
5603 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5604 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
5605 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
5606 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
5607 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5609 ; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800
5612 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5616 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7]
5619 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5620 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3],xmm8[4,5],xmm14[6],xmm8[7]
5625 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7]
5627 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5629 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5630 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5634 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5635 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5636 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5638 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5640 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
5645 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
5646 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,3,4,5,6,7]
5647 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
5648 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
5649 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5652 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5654 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3]
5655 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7]
5656 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
5657 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
5658 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
5659 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
5665 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
5668 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
5669 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
5670 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
5671 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
5672 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5674 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
5675 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5676 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
5677 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5680 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
5682 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
5683 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
5684 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
5685 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
5686 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4]
5687 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
5691 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7]
5692 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5693 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
5694 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5695 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
5697 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5699 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5702 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
5703 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
5704 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
5705 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
5707 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
5708 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7]
5712 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
5713 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
5714 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
5716 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
5717 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
5718 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
5720 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
5721 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7]
5724 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
5725 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
5726 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5730 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
5733 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
5737 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
5740 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
5742 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1]
5743 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
5744 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
5745 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
5747 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
5750 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
5752 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7]
5756 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
5757 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5758 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
5762 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1}
5765 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
5768 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1]
5769 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7]
5770 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
5771 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7]
5775 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7]
5776 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
5777 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
5782 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5783 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
5784 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
5786 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
5787 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
5788 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
5793 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
5794 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7]
5797 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1}
5799 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
5800 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
5801 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
5804 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
5806 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5808 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5810 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
5822 ; AVX512DQ-FCP: # %bb.0:
5824 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
5827 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
5831 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
5833 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
5839 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
5842 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
5846 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7]
5849 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7]
5854 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
5856 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7]
5859 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
5862 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7]
5865 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
5871 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7]
5876 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
5879 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
5880 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
5881 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
5882 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
5883 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5885 ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
5888 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5892 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7]
5895 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5]
5896 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7]
5901 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7]
5903 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5905 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
5906 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
5910 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
5911 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
5912 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5914 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5916 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
5921 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
5922 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
5923 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
5924 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
5927 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
5929 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
5930 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
5932 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
5933 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
5934 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
5940 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
5943 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
5944 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
5945 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
5946 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
5948 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
5949 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
5950 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
5951 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7]
5954 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
5956 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3]
5958 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1]
5959 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4]
5960 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
5964 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
5965 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
5966 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
5967 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
5968 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
5970 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
5972 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
5976 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
5979 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
5980 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7]
5985 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
5987 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
5988 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5990 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
5991 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7]
5994 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
5995 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
5996 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6000 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
6003 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
6007 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
6010 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
6012 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1]
6013 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
6016 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7]
6019 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
6021 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
6025 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
6026 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6027 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6031 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1}
6034 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7]
6037 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
6039 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7]
6043 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7]
6044 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
6045 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7]
6050 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7]
6055 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7]
6056 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
6059 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
6062 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1}
6065 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
6068 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
6070 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6072 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6074 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6086 ; AVX512BW: # %bb.0:
6087 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6094 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6095 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
6097 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6098 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6100 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6102 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800
6105 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800
6108 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6109 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6111 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6112 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6114 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6118 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6119 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6121 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6122 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6124 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6126 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
6129 ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
6132 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6133 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6135 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6136 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6138 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6142 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6144 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6145 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6150 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6151 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6154 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6156 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6157 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
6160 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6161 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
6174 ; AVX512BW-FCP: # %bb.0:
6175 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6182 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6183 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
6185 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6186 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6188 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6190 ; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
6193 ; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
6196 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6197 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6199 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6200 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6202 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6206 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6207 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6209 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6210 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6212 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6214 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
6217 ; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
6220 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6221 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6223 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6224 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6226 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6230 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6232 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6233 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6238 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6239 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6242 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6244 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6245 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
6248 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6249 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
6262 ; AVX512DQ-BW: # %bb.0:
6263 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6270 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6271 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
6273 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6274 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6276 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6278 ; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800
6281 ; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800
6284 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6285 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6287 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6288 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6290 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6294 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6295 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6297 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6298 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6300 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6302 ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
6305 ; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
6308 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6309 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6311 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6312 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6314 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6318 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6320 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6321 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6326 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6327 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6330 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6332 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6333 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
6336 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6337 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
6350 ; AVX512DQ-BW-FCP: # %bb.0:
6351 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6358 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
6359 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
6361 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
6362 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6364 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
6366 ; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
6369 ; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
6372 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
6373 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6375 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
6376 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6378 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
6382 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
6383 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6385 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
6386 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6388 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
6390 ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
6393 ; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
6396 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
6397 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6399 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
6400 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6402 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
6406 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
6408 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
6409 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
6414 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
6415 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6418 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
6420 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
6421 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
6424 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
6425 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
6437 %strided.vec0 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
6454 ; SSE: # %bb.0:
6455 ; SSE-NEXT: subq $1176, %rsp # imm = 0x498
6457 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6459 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6461 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6465 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6467 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6469 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6470 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7]
6471 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
6474 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3]
6475 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6476 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
6483 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
6484 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6486 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
6487 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6488 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6489 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6490 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6491 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6494 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6495 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3]
6496 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6497 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6499 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6500 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6501 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7]
6505 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6506 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6507 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6508 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6516 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6517 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6519 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
6520 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6522 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6523 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
6524 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
6525 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6528 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6530 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6531 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
6532 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6533 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
6535 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
6536 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6538 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6539 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6543 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6545 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6546 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6547 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6549 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6556 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6557 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6559 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
6560 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6563 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6564 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6565 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6568 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6570 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6571 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6572 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6573 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6575 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6576 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6578 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6579 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6583 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6585 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6586 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6587 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6588 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6596 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
6598 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
6599 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6601 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6602 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6603 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6604 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6607 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6609 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6610 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6611 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6612 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6614 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6615 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6617 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6618 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6622 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6624 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6625 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6626 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6627 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6635 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
6637 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
6638 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6640 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6641 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
6642 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
6643 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6646 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6648 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6649 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6650 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6651 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6653 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
6654 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6656 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6657 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6661 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6663 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6664 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6665 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6666 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6674 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
6676 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
6677 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6679 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6680 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
6681 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
6682 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6685 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6687 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6688 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
6689 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6690 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
6692 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
6693 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6695 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6696 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6700 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6702 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6703 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
6704 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6705 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6713 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
6715 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0]
6716 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6718 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6719 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
6720 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
6721 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6724 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6726 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6727 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6728 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6729 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6731 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
6732 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6734 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6735 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
6740 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6741 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3]
6742 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7]
6744 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6751 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6753 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0]
6754 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6756 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6757 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0]
6758 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3]
6759 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6762 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6764 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6765 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3]
6766 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7]
6768 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
6769 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6770 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6773 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6774 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6777 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6781 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6782 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6783 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6785 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0]
6786 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6787 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6789 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6790 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6793 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6796 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6797 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6798 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6800 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
6801 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6804 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6805 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6808 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6812 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6813 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6814 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6816 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
6817 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6818 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6820 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6821 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6824 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6827 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6828 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6830 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0]
6831 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6832 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6835 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6836 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6839 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6843 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6844 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6846 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0]
6847 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6848 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6850 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6851 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6854 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6857 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6858 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6860 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0]
6861 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6862 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6865 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6866 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
6869 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6873 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6874 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
6876 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
6877 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6879 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7]
6882 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
6884 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
6886 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0]
6887 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6890 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6892 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6893 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535]
6896 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6897 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0]
6898 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3]
6899 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
6900 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
6901 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7]
6904 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6905 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6906 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
6907 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6908 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
6909 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
6910 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4]
6911 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
6916 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6918 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6920 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
6923 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6924 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6925 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0]
6926 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3]
6927 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
6928 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6929 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
6933 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6934 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
6936 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
6937 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
6938 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
6943 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6946 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6948 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
6951 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6952 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0]
6953 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3]
6954 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
6955 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6956 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
6959 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6960 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6961 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
6962 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6963 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
6964 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
6965 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
6970 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6973 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
6975 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
6978 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6979 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0]
6980 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3]
6981 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7]
6982 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6983 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
6986 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6987 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6988 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
6989 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6990 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
6991 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
6992 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
6997 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6998 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7000 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7002 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
7005 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7006 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7007 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
7008 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
7009 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7]
7010 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
7011 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
7014 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7015 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7016 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
7017 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7018 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
7019 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
7020 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
7025 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7026 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7028 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7030 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
7033 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7034 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7035 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0]
7036 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3]
7037 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7]
7038 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
7039 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7]
7042 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7043 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7044 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
7045 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7046 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7]
7047 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
7048 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
7053 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7054 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7056 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7058 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
7061 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7062 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7063 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0]
7064 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3]
7065 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
7066 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
7067 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7]
7070 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7071 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7072 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
7073 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7074 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7]
7075 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
7076 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
7081 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7082 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7085 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7087 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
7090 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7091 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7092 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0]
7093 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3]
7094 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7]
7095 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
7096 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
7099 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7100 ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
7101 ; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,2]
7102 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7103 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
7104 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
7105 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
7110 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7112 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
7114 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0]
7118 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
7119 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
7123 ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7]
7124 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
7130 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7133 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
7135 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0]
7139 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7140 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
7143 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7144 ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7]
7145 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
7150 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7151 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7154 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7156 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0]
7160 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
7161 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
7164 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7165 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
7166 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7171 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7172 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7175 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7177 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0]
7181 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
7182 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
7185 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7186 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
7187 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7192 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7193 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7196 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7198 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
7202 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
7203 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
7206 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7207 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
7208 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7213 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7214 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7217 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7219 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
7223 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7224 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
7227 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7228 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
7229 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7234 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7237 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
7241 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7242 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
7245 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7246 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
7247 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7252 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7253 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7256 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7258 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
7260 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7261 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
7265 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7266 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
7267 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
7272 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7273 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7275 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7277 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7278 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
7279 ; SSE-NEXT: # xmm11 = mem[0,1,0,3]
7280 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6]
7281 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7283 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
7284 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7285 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7]
7286 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
7287 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7292 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7293 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7295 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7297 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7298 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7299 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
7300 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6]
7302 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
7303 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7304 ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7]
7305 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
7306 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7311 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7312 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7314 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7316 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7317 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7318 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
7319 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7320 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7322 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7323 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7324 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
7325 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7326 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7331 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7332 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7334 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7336 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7337 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7338 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
7339 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7340 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7342 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7343 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7344 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
7345 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7346 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7351 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7352 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7354 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7356 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7357 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7358 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
7359 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7360 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7362 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7363 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7364 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
7365 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7366 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7371 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7373 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7375 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7376 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7377 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
7378 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7379 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7381 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7382 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7383 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
7384 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7385 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7390 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7392 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7394 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7395 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7396 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
7397 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7398 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7399 ; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7401 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7402 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7403 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
7404 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7405 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7410 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7412 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7414 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7415 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3]
7416 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7417 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
7419 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
7420 ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7421 ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7]
7422 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
7423 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6]
7430 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7433 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7]
7435 ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
7437 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
7438 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
7443 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7445 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7447 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7449 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7451 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
7453 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
7454 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7456 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7457 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7462 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7464 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7466 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
7467 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7469 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7470 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
7472 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
7473 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7475 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7476 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7481 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7483 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7485 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7487 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7489 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7490 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
7492 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7493 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7495 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7496 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7501 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7503 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7505 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7507 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7509 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7510 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
7512 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7513 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7515 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7516 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7521 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7523 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7525 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7527 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7529 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7530 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
7532 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7533 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7535 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7536 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7541 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7543 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7545 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
7546 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7548 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7549 ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7]
7551 ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
7552 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7554 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
7555 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
7560 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7562 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7564 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7566 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7568 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7569 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
7571 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
7573 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7575 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
7576 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
7579 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7581 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7583 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7585 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7587 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7589 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7591 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7593 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7595 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7597 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7599 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7601 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7603 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7605 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7607 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7609 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7611 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7613 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7615 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7617 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7619 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7621 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7623 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7625 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7627 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7629 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7631 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7633 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7635 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7637 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7639 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7647 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7649 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7651 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7653 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7655 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
7664 ; SSE-NEXT: addq $1176, %rsp # imm = 0x498
7668 ; AVX: # %bb.0:
7669 ; AVX-NEXT: subq $1368, %rsp # imm = 0x558
7671 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7672 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7673 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7674 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7676 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7679 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7682 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7684 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7687 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7689 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7692 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7694 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7695 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7696 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7697 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
7698 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
7699 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
7700 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7701 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
7703 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
7704 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
7706 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7709 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7711 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7713 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7716 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7717 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
7718 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7719 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
7720 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7721 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7]
7722 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
7727 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7732 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7734 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7736 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7737 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7738 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7739 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
7741 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7745 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7748 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7749 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3]
7750 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7]
7751 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7753 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7754 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,3]
7755 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7]
7757 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7759 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
7760 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7762 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7765 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7767 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7769 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7772 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7773 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,3,2,3]
7774 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7]
7775 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
7776 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7]
7781 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7783 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7786 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7788 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7790 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7791 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7792 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7793 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
7795 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7799 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7802 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7803 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,3]
7804 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7]
7805 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7807 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7808 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3]
7809 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7]
7811 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7813 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
7814 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7816 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7819 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7821 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7824 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7826 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7827 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3]
7828 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7]
7829 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
7830 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7]
7835 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7837 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7840 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7842 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7844 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7845 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7846 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7847 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
7849 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7853 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7856 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7857 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
7858 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7]
7859 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
7861 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7862 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
7863 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7]
7865 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7867 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4,5],xmm0[6,7]
7868 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4,5,6,7]
7870 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7873 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7875 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
7877 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7880 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7881 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7882 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7]
7883 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
7884 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7]
7889 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7890 ; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7891 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
7892 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7895 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7897 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7898 ; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7900 ; AVX-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7901 ; AVX-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7]
7903 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
7904 ; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
7905 ; AVX-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7]
7906 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7909 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7]
7910 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7911 ; AVX-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7912 ; AVX-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7]
7913 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
7915 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
7916 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
7918 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
7919 ; AVX-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3]
7920 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7]
7925 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7926 ; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7927 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
7928 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7931 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7934 ; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7936 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7]
7937 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
7939 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
7940 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7]
7941 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7944 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7]
7945 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7946 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7]
7947 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
7949 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
7950 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
7952 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
7953 ; AVX-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
7954 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7]
7960 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7961 ; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7962 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
7963 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7966 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7968 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7969 ; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7971 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
7972 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
7974 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
7975 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
7976 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7979 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7]
7980 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7981 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
7983 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
7984 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
7986 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
7987 ; AVX-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
7988 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7]
7993 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7994 ; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7995 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
7996 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7999 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8001 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8002 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
8004 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
8005 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8007 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
8008 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7]
8009 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8012 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
8013 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
8014 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
8015 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8017 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8018 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8020 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8021 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
8022 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
8027 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8028 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8030 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8032 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
8033 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
8034 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8035 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8036 ; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
8037 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8039 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
8040 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8041 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8042 ; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
8043 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8045 ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
8046 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8047 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8048 ; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7]
8049 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8052 ; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
8056 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8058 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8060 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
8061 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8062 ; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
8063 ; AVX-NEXT: # xmm3 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
8064 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8066 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8071 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8072 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8074 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8076 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
8077 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8078 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8079 ; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8080 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8082 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
8083 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8084 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload
8085 ; AVX-NEXT: # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8086 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8088 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8090 ; AVX-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
8091 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8097 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8099 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8101 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
8102 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8103 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
8104 ; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
8105 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8107 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8112 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8113 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8116 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
8117 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
8118 ; AVX-NEXT: # xmm2 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
8119 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8121 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
8122 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload
8123 ; AVX-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
8124 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8126 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7]
8127 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8133 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8136 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
8137 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7]
8138 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8140 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
8145 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8146 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8148 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8150 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
8151 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8152 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8153 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
8154 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8156 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8157 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8158 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
8159 ; AVX-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
8161 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8162 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload
8163 ; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
8169 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8171 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8173 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
8174 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8175 ; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8176 ; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7]
8178 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7]
8184 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8185 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8187 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8189 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
8191 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8193 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7]
8194 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8196 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
8197 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8203 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8205 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8207 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0]
8208 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8210 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
8215 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8216 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8218 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8220 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
8221 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8223 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7]
8224 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8226 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8232 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8234 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8236 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0]
8237 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8239 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
8244 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8245 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8248 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
8249 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8251 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7]
8259 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8261 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
8263 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
8268 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8269 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8271 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8273 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
8274 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8276 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
8277 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8279 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8285 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8287 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8289 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
8290 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8292 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
8297 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8298 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8300 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8302 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8303 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8305 ; AVX-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7]
8307 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
8310 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8312 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8314 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8315 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8316 ; AVX-NEXT: # xmm3 = mem[0,1,0,3]
8317 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8318 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
8320 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
8321 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8325 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8326 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
8327 ; AVX-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8328 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8329 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3]
8330 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8331 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
8334 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
8336 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
8337 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8338 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8340 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8342 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8343 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8344 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
8345 ; AVX-NEXT: # xmm13 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
8348 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8350 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8352 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8353 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8354 ; AVX-NEXT: # xmm3 = mem[0,1,0,3]
8355 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,4,6]
8356 ; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
8358 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7]
8362 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8363 ; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload
8364 ; AVX-NEXT: # xmm9 = mem[0,1],xmm2[2,3],mem[4,5,6,7]
8365 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8366 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8367 ; AVX-NEXT: # xmm6 = mem[0,1,0,3]
8368 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6]
8369 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8372 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7]
8374 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7]
8375 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8376 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8378 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8380 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8381 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8382 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8383 ; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8384 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8387 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8389 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
8391 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
8392 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8393 ; AVX-NEXT: # xmm9 = mem[0,1,0,3]
8394 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8395 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6]
8397 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7]
8401 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8402 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload
8403 ; AVX-NEXT: # xmm8 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8404 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8405 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3]
8406 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8407 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6]
8408 ; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8411 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7]
8413 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7]
8414 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8415 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8417 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8419 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8420 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8421 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8422 ; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8423 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8426 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8428 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8430 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3]
8431 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8432 ; AVX-NEXT: # xmm15 = mem[0,1,0,3]
8433 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,4,6]
8434 ; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
8436 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7]
8440 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8441 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
8442 ; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
8444 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
8445 ; AVX-NEXT: # xmm14 = mem[0,1,0,3]
8446 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,4,6]
8447 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8449 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
8451 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
8452 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8453 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8456 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8462 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8463 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8465 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
8467 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
8472 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7]
8474 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8476 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
8478 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5,6,7]
8479 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8481 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8483 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8487 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8489 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8491 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
8492 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8494 ; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8495 ; AVX-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7]
8497 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3,4,5,6,7]
8501 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8503 ; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8504 ; AVX-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7]
8506 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8508 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
8510 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
8511 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8513 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8515 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
8516 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8519 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8521 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8523 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
8524 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8526 ; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8527 ; AVX-NEXT: # xmm9 = mem[0,1,2,3,4,5,5,7]
8529 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5,6,7]
8533 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8535 ; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8536 ; AVX-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7]
8538 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8540 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
8542 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7]
8543 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8546 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
8547 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8550 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8552 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8554 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8555 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8557 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,5,5,7]
8559 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7]
8565 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7]
8567 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
8569 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7]
8570 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8572 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8574 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8576 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8578 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8580 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8582 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8584 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8586 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8588 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8590 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8592 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8594 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8596 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8598 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8600 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8602 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8604 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8606 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8608 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8610 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
8615 ; AVX-NEXT: addq $1368, %rsp # imm = 0x558
8620 ; AVX2: # %bb.0:
8621 ; AVX2-NEXT: subq $1272, %rsp # imm = 0x4F8
8629 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8631 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8635 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8636 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
8637 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8639 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8640 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
8642 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8643 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
8644 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8646 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8647 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
8648 ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8649 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
8650 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
8651 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
8655 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
8656 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
8658 ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
8660 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8662 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8664 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8665 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
8669 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7]
8670 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
8673 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8675 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8677 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8678 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
8682 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
8683 ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8684 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
8687 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8689 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8691 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8692 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
8696 ; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
8697 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8698 ; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8699 ; AVX2-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
8702 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8706 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
8707 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
8711 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8714 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
8715 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
8718 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8721 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
8722 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
8725 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8729 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
8730 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
8732 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8733 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8734 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
8735 ; AVX2-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
8737 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8738 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
8739 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8740 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
8742 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
8744 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8745 ; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8746 ; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
8747 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8749 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
8751 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8753 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8754 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
8757 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8758 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
8759 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
8762 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
8763 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
8764 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8765 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8766 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
8767 ; AVX2-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
8769 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8770 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
8771 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8773 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
8774 ; AVX2-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
8775 ; AVX2-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
8776 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8778 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
8780 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8782 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8783 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
8786 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8787 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
8790 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
8791 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
8792 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8793 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8794 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
8795 ; AVX2-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
8797 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
8798 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8800 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
8801 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8802 ; AVX2-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
8803 ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
8804 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8806 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
8808 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8810 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8811 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7]
8814 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3,4],xmm12[5,6,7]
8817 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
8818 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
8819 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8821 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8823 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8824 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
8827 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4],xmm11[5,6,7]
8829 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8830 ; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8831 ; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
8834 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,2,0,3]
8835 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7]
8836 ; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm15[2],xmm8[3],xmm15[4,5],xmm8[6,7]
8837 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8838 ; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
8839 ; AVX2-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7]
8841 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
8843 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15]
8844 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
8845 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8846 ; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8847 ; AVX2-NEXT: # xmm9 = mem[2,1,0,3]
8850 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
8851 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm9[2],xmm5[3,4],xmm9[5],xmm5[6],xmm9[7]
8853 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8855 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
8856 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
8857 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7]
8859 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8861 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
8862 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3],xmm15[4,5],xmm7[6],xmm15[7]
8864 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15]
8865 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
8866 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8867 ; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8868 ; AVX2-NEXT: # xmm7 = mem[2,1,0,3]
8871 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
8872 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6],xmm7[7]
8873 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8875 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7]
8876 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
8877 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
8878 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8880 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
8881 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
8883 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
8884 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
8885 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8886 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[2,1,0,3]
8889 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
8890 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6],xmm4[7]
8891 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8893 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
8894 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
8895 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
8897 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
8898 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
8900 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
8901 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
8902 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8904 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
8905 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
8907 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,1,0,3]
8910 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
8911 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6],xmm3[7]
8912 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7]
8913 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
8914 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
8916 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
8917 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8918 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8919 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8920 ; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
8921 ; AVX2-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
8922 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8923 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8924 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8925 ; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
8927 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8928 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8929 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
8930 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8931 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
8932 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
8933 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
8934 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
8935 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8936 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8937 ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
8940 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
8941 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
8942 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
8943 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
8944 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
8946 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
8948 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
8949 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
8950 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
8951 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8952 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8953 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8954 ; AVX2-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
8955 ; AVX2-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
8956 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8957 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8958 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8959 ; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
8961 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8962 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8963 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
8964 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8965 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
8966 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
8967 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
8968 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
8969 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8970 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
8971 ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
8973 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8975 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
8976 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8977 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
8978 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
8979 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
8980 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
8983 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
8984 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
8985 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
8986 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
8987 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8988 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8989 ; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
8990 ; AVX2-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
8991 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8992 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8993 ; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
8995 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1]
8996 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
8997 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
8998 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
8999 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4]
9000 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9001 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9002 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9003 ; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
9006 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1]
9007 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7]
9008 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
9009 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7]
9010 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7]
9013 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15]
9014 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4]
9015 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
9016 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7]
9017 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9018 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9019 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
9020 ; AVX2-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9022 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
9023 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3]
9024 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7]
9025 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
9026 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4]
9027 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
9028 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9029 ; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
9030 ; AVX2-NEXT: # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7]
9031 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
9032 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
9033 ; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
9036 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
9037 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7]
9038 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3]
9039 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7]
9040 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7]
9043 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
9044 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
9045 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
9046 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
9047 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9048 ; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9049 ; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5]
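; NOTE: pshufhw immediates decode as four 2-bit indexes into words 4-7 while
; words 0-3 pass through: $103 = 0b01100111 -> indexes [3,1,2,1] -> high words
; [7,5,6,5], matching the expansion comment above.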
9050 ; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9052 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
9053 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7]
9055 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
9056 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
9057 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
9059 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
9062 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
9063 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
9064 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
9065 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
9066 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9067 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,5,6,5]
9069 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
9070 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7]
9072 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
9073 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
9074 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
9077 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9078 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
9079 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9080 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9081 ; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9082 ; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5]
9083 ; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9085 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
9086 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
9087 ; AVX2-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9089 ; AVX2-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9090 ; AVX2-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7]
9091 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
9092 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
9093 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9096 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9097 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
9098 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9099 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9100 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5]
9102 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
9103 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
9106 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7]
9107 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
9108 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
9110 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
9111 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
9112 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
9113 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9114 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9115 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9116 ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9118 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
9119 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
9120 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
9121 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
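; NOTE: this 16-byte constant repeats one 8-byte pattern across both halves;
; the byte pairs (8,9 / 0,1 / 12,13) look like per-lane byte-shuffle indexes,
; presumably consumed as a shuffle control by a line elided from this excerpt.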
9123 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
9125 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9126 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7]
9127 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9128 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9129 ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9131 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1]
9132 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7]
9133 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
9135 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7]
9137 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
9138 ; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7]
9139 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9140 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9141 ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9143 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1]
9144 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7]
9145 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
9147 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7]
9149 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9150 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7]
9151 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9152 ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9153 ; AVX2-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9156 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
9157 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7]
9158 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
9159 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7]
9161 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
9162 ; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7]
9163 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
9165 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
9166 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
9167 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
9169 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
9170 ; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
9172 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
9173 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
9174 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
9176 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9177 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
9179 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7]
9180 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
9181 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
9183 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
9184 ; AVX2-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
9186 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
9187 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
9188 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
9190 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9191 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
9192 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9194 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9196 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9198 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9200 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9202 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9204 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9206 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9208 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9210 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9212 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9214 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9218 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
9225 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
9230 ; AVX2-NEXT: addq $1272, %rsp # imm = 0x4F8
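As a quick cross-check of the vpblendd immediates that dominate the AVX2 block above, here is a minimal C intrinsics sketch (an editor's illustration, not part of the generated test; compile with -mavx2): _mm256_blend_epi32 takes dword i from its second operand when bit i of the immediate is set, reproducing the $36 pattern ymm[0,1],mem[2],ymm[3,4],mem[5],ymm[6,7] seen in the checks.

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);         /* stands in for ymm0 */
        __m256i b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17); /* stands in for mem  */
        /* imm 36 = 0b00100100: bits 2 and 5 set, so dwords 2 and 5 come from b. */
        __m256i r = _mm256_blend_epi32(a, b, 36);
        int out[8];
        _mm256_storeu_si256((__m256i *)out, r);
        for (int i = 0; i < 8; i++)
            printf("%d ", out[i]); /* prints: 0 1 12 3 4 15 6 7 */
        printf("\n");
        return 0;
    }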
9235 ; AVX2-FP: # %bb.0:
9236 ; AVX2-FP-NEXT: subq $1304, %rsp # imm = 0x518
9244 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9246 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9251 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
9252 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9254 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9255 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
9257 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9258 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
9259 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9261 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9262 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
9263 ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9264 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
9265 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
9266 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
9270 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
9271 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
9273 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
9275 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9277 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9279 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9280 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
9284 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
9285 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
9288 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9290 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9292 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9293 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
9297 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
9298 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9299 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
9302 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9304 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9306 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9307 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
9311 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
9312 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9313 ; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
9314 ; AVX2-FP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
9317 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9321 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
9325 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9328 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
9331 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9334 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
9337 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9341 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
9343 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9344 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9345 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
9346 ; AVX2-FP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9348 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
9349 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9350 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
9353 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
9356 ; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9357 ; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
9358 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9360 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9362 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9364 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9365 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
9366 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
9369 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9371 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9373 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9374 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9375 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9376 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9377 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
9378 ; AVX2-FP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9380 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9381 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9384 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
9385 ; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
9386 ; AVX2-FP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
9387 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9390 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
9392 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9394 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9395 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
9398 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9402 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9403 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9404 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9405 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9406 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
9407 ; AVX2-FP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9409 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9410 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9413 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
9414 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9415 ; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
9416 ; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
9417 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9419 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
9421 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9423 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9424 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
9427 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9429 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
9431 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15]
9432 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
9433 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9435 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9437 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9438 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
9442 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
9443 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9444 ; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9445 ; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
9447 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3]
9450 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7]
9451 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9452 ; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9453 ; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
9455 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
9457 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
9458 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
9459 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9462 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9464 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
9466 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9468 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
9470 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
9472 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9474 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
9475 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7]
9477 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
9478 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9479 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9481 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9483 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
9484 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9487 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
9488 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9490 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
9491 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
9493 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
9494 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9495 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9497 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9499 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
9500 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9503 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
9504 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9506 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
9507 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
9509 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
9510 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9511 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9513 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
9514 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
9518 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
9520 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
9522 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
9523 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9524 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9526 ; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9527 ; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
9528 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9529 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9530 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9531 ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9532 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
9535 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
9536 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9537 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
9539 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
9540 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9541 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9542 ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9543 ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
9545 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
9547 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
9549 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
9550 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
9552 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
9554 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9555 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
9556 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9557 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9558 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9559 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9560 ; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9561 ; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
9562 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9563 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9564 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9565 ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9566 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
9567 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9569 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
9570 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9572 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
9573 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9574 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9575 ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9576 ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
9578 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
9579 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9581 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9583 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
9584 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
9587 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
9588 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
9589 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9590 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9591 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9592 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9593 ; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
9594 ; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
9595 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9596 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9597 ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9598 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
9600 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
9602 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
9603 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
9604 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9605 ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9606 ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
9608 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
9611 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
9612 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
9615 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
9616 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
9617 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
9618 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
9619 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9620 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9621 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
9622 ; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9623 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
9626 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
9627 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
9628 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
9629 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9630 ; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
9631 ; AVX2-FP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
9632 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9633 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
9634 ; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
9636 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
9639 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
9640 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
9643 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
9644 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
9645 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
9646 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
9647 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9651 ; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9652 ; AVX2-FP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5]
9653 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
9654 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
9657 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
9659 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9662 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
9663 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
9664 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
9665 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
9666 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9668 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
9669 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
9672 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
9675 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
9676 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
9677 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
9678 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9679 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9681 ; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9682 ; AVX2-FP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5]
9683 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
9684 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9686 ; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9688 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
9689 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9692 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
9693 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
9694 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
9695 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
9697 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
9698 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
9702 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
9704 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
9705 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
9706 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
9707 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9708 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9709 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9710 ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9712 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
9713 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
9716 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
9718 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
9719 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
9720 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9721 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9722 ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9724 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
9727 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
9729 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
9730 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
9731 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9732 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
9733 ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9735 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
9738 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
9740 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
9741 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
9742 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9743 ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
9744 ; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9746 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
9749 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
9751 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
9752 ; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
9753 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
9756 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
9758 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
9759 ; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
9762 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
9764 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9765 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
9768 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
9770 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
9771 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
9774 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
9776 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9777 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
9778 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9780 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9782 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9784 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9786 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9788 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9790 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9792 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9794 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9796 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9798 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9800 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9804 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9811 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9816 ; AVX2-FP-NEXT: addq $1304, %rsp # imm = 0x518
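The vpshuflw $85 folded reloads recur in every prefix block and decode identically in all of them. A minimal sketch of the $85 = 0b01010101 case (illustration only; SSE2 suffices here), which replicates word 1 across the four low words and passes the high words through:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        /* imm 85 = 0b01010101: all four 2-bit fields select word 1;
           words 4-7 are copied unchanged -> [1,1,1,1,4,5,6,7]. */
        __m128i r = _mm_shufflelo_epi16(v, 85);
        short out[8];
        _mm_storeu_si128((__m128i *)out, r);
        for (int i = 0; i < 8; i++)
            printf("%d ", out[i]); /* prints: 1 1 1 1 4 5 6 7 */
        printf("\n");
        return 0;
    }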
9821 ; AVX2-FCP: # %bb.0:
9822 ; AVX2-FCP-NEXT: subq $1304, %rsp # imm = 0x518
9830 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9832 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9837 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
9838 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9840 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9841 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
9843 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9844 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
9845 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9847 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9848 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
9849 ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9850 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
9851 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
9852 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
9856 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
9857 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
9859 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
9861 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9863 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9865 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9866 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
9870 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
9871 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
9874 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9876 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9878 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9879 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
9883 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
9884 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9885 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
9888 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9890 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9892 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9893 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
9897 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
9898 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9899 ; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
9900 ; AVX2-FCP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
9903 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9907 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
9911 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9914 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
9917 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9920 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
9923 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9927 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
9929 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9930 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9931 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
9932 ; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
9934 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
9935 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9936 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
9939 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
9942 ; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
9943 ; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
9944 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9946 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
9948 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9950 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9951 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
9952 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
9955 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9957 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9959 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9960 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9961 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9962 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9963 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
9964 ; AVX2-FCP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9966 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9967 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9970 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
9971 ; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
9972 ; AVX2-FCP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
9973 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9976 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
9978 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9980 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9981 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
9984 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9986 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
9988 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
9989 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9990 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9991 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9992 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
9993 ; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
9995 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
9996 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9999 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
10000 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10001 ; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
10002 ; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
10003 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10005 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
10007 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10009 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10010 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
10013 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10015 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
10017 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15]
10018 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
10019 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10021 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10023 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10024 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
10028 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
10029 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10030 ; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10031 ; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
10033 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3]
10036 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7]
10037 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10038 ; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10039 ; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
10041 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
10043 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
10044 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
10045 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10048 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10050 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
10052 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10054 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
10056 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
10058 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10060 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
10061 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7]
10063 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10064 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10065 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10067 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10069 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
10070 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10073 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10074 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10076 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
10077 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
10079 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10080 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10081 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10083 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10085 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
10086 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10089 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
10090 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10092 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
10093 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
10095 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10096 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10097 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10099 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
10100 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
10104 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
10106 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
10108 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
10109 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10110 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10112 ; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10113 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10114 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10115 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10116 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10117 ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10118 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
10121 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
10122 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10123 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
10125 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
10126 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
10127 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10128 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10129 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10131 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
10133 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
10135 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
10136 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10138 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
10140 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
10141 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
10142 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
10143 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10144 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10145 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10146 ; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10147 ; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
10148 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10149 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10150 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10151 ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10152 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
10153 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10155 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
10156 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10158 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
10159 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
10160 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10161 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10162 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10164 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
10165 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10167 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10169 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
10170 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10173 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
10174 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
10175 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
10176 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10177 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10178 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10179 ; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
10180 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10181 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10182 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10183 ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10184 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
10186 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
10188 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
10189 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
10190 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10191 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10192 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10194 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
10197 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
10198 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
10201 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
10202 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
10203 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
10204 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
10205 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10206 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10207 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10208 ; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10209 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
10212 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
10213 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
10214 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
10215 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10216 ; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
10217 ; AVX2-FCP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10218 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10219 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10220 ; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10222 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
10225 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
10226 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
10229 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
10230 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
10231 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
10232 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
10233 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10237 ; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10238 ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5]
10239 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
10240 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
10243 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
10245 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10248 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
10249 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
10250 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
10251 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
10252 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10254 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
10255 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
10258 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
10261 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
10262 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
10263 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
10264 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10265 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10267 ; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10268 ; AVX2-FCP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5]
10269 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
10270 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10272 ; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10274 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
10275 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10278 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
10279 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
10280 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
10281 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
10283 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
10284 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
10288 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
10290 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
10291 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
10292 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
10293 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10294 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10295 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10296 ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10298 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
10299 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
10302 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
10304 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
10305 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
10306 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10307 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10308 ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10310 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
10313 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
10315 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
10316 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
10317 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10318 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10319 ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10321 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
10324 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
10326 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10327 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
10328 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10329 ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10330 ; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10332 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
10335 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
10337 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
10338 ; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
10339 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
10342 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
10344 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
10345 ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
10348 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
10350 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10351 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
10354 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
10356 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10357 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
10360 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
10362 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10363 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
10364 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10366 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10368 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10370 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10372 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10374 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10376 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10378 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10380 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10382 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10384 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10386 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10390 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10397 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10402 ; AVX2-FCP-NEXT: addq $1304, %rsp # imm = 0x518
10407 ; AVX512: # %bb.0:
10408 ; AVX512-NEXT: subq $1480, %rsp # imm = 0x5C8
10409 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
10411 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10413 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10414 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
10418 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3]
10419 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
10420 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
10422 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10424 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10425 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
10428 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
10430 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
10434 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10437 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10439 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10440 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
10443 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3]
10444 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
10445 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
10447 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10449 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10450 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
10451 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
10454 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
10455 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10458 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10460 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10461 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
10464 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
10469 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10471 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10472 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
10473 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
10477 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10478 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
10479 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10480 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10482 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10484 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10485 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
10487 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3]
10488 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
10490 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
10493 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10494 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
10498 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7]
10502 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10504 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10506 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10507 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
10510 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
10511 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
10512 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
10515 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10517 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10518 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
10520 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
10521 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10523 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10526 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
10529 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7]
10533 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10535 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
10540 ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
10541 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
10542 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
10543 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10547 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7]
10550 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
10551 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
10554 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10557 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
10560 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
10561 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10563 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
10564 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
10568 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
10569 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
10570 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
10576 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
10579 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
10580 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
10583 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10589 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
10590 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
10591 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10596 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5]
10597 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
10599 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
10600 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
10601 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10603 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10604 ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10605 ; AVX512-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10608 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
10609 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
10611 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
10612 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7]
10614 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
10616 ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
10617 ; AVX512-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
10619 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
10620 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
10622 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
10623 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
10624 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4]
10626 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
10629 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10630 ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10631 ; AVX512-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10634 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
10635 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
10637 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
10638 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
10640 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
10641 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10642 ; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
10643 ; AVX512-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
10644 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
10647 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
10648 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
10649 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10651 ; AVX512-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
10653 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
10654 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
10655 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
10656 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
10657 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
10658 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
10661 ; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
10662 ; AVX512-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7]
10663 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
10666 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
10667 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
10668 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
10670 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
10672 ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm22 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
10674 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10675 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10676 ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10677 ; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
10680 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
10681 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
10682 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
10683 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
10684 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10685 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10686 ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10687 ; AVX512-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
10689 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
10690 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
10691 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
10692 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
10693 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
10694 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
10697 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10698 ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10699 ; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
10702 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1]
10703 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7]
10704 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
10705 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7]
10706 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
10707 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10708 ; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
10709 ; AVX512-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
10711 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
10712 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10713 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10714 ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10715 ; AVX512-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
10717 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
10718 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
10719 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
10720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
10721 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
10722 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
10723 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10724 ; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10725 ; AVX512-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
10729 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
10730 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
10731 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
10735 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10739 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
10740 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
10741 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
10743 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
10746 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
10747 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
10753 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
10754 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
10755 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
10759 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
10760 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
10761 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
10763 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
10764 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
10768 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
10769 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
10770 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
10771 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
10772 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10773 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
10774 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10776 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10778 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10783 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
10784 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
10785 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
10786 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
10788 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
10789 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
10793 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
10794 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
10795 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
10797 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
10798 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
10800 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
10802 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
10803 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
10805 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
10806 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
10807 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
10811 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10812 ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10813 ; AVX512-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10814 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
10819 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
10821 ; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload
10822 ; AVX512-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7]
10824 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1]
10825 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
10827 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
10828 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
10831 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
10834 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10835 ; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
10836 ; AVX512-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
10837 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10838 ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10839 ; AVX512-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10844 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
10845 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
10846 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
10854 ; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
10855 ; AVX512-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7]
10856 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10858 ; AVX512-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7]
10860 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1]
10861 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
10863 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
10866 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
10868 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
10872 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
10874 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10875 ; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
10876 ; AVX512-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7]
10877 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10878 ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
10879 ; AVX512-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
10883 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
10887 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10888 ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
10889 ; AVX512-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
10893 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
10894 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10895 ; AVX512-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
10896 ; AVX512-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
10898 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1]
10899 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7]
10900 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
10902 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
10906 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10907 ; AVX512-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
10908 ; AVX512-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
10909 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10910 ; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10911 ; AVX512-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
10914 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1]
10915 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
10916 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
10917 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
10921 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7]
10926 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
10927 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
10928 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
10932 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
10933 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
10934 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7]
10939 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
10940 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7]
10949 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
10950 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
10951 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7]
10956 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
10961 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
10962 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
10966 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
10967 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
10972 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
10973 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
10974 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7]
10980 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
10981 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3]
10982 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7]
10984 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
10986 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
10987 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10988 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
10990 ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800
10992 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10995 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10996 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
10998 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11001 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11002 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
11004 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11007 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11008 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload
11010 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11013 ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
11018 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11020 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11026 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
11029 ; AVX512-NEXT: addq $1480, %rsp # imm = 0x5C8
11034 ; AVX512-FCP: # %bb.0:
11035 ; AVX512-FCP-NEXT: subq $1416, %rsp # imm = 0x588
11036 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
11038 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11040 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11041 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11045 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
11048 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
11050 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11052 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11053 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11054 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
11059 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
11062 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11065 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11067 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11068 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
11071 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
11074 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
11076 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11078 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11079 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
11080 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
11083 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
11084 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11086 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11088 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11089 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
11094 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
11098 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11100 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11101 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
11102 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
11106 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
11107 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
11108 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11109 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11111 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11113 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11114 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11117 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,0,3]
11119 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
11121 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11123 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11128 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
11131 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11133 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11135 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11136 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
11139 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
11141 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
11144 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11146 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11147 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
11149 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
11150 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11154 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11155 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
11159 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
11162 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11164 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
11169 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
11170 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
11171 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11172 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11176 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7]
11179 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
11180 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
11183 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11186 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7]
11189 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
11190 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11192 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
11193 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
11197 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
11198 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
11199 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11205 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
11208 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
11209 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
11212 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11218 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
11219 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
11220 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11225 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5]
11226 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
11228 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
11229 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
11230 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11232 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11233 ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11234 ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11237 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
11238 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
11241 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7]
11243 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
11245 ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11246 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
11248 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
11249 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
11252 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
11253 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
11255 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
11258 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11259 ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11260 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
11263 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
11264 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
11267 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
11269 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
11270 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11271 ; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
11272 ; AVX512-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
11273 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
11276 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
11277 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11278 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11280 ; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
11282 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
11285 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1]
11286 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4]
11287 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
11290 ; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
11291 ; AVX512-FCP-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7]
11292 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
11295 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
11296 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
11297 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
11299 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11301 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
11303 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11304 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11305 ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11306 ; AVX512-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
11308 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1]
11311 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7]
11312 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
11313 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11314 ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11315 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
11317 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3]
11319 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
11320 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4]
11321 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
11324 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11325 ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11326 ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11328 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1]
11331 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7]
11332 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
11333 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11334 ; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
11335 ; AVX512-FCP-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
11337 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
11338 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11339 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11340 ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11341 ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11343 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
11345 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
11346 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4]
11347 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
11348 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11349 ; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
11350 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
11354 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
11355 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
11356 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
11366 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
11371 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5]
11372 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7]
11375 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
11381 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7]
11385 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7]
11386 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
11389 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5]
11390 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7]
11394 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15]
11395 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5]
11396 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
11402 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
11403 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11404 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
11405 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11407 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11409 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11411 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7]
11413 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
11414 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
11420 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
11422 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
11423 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
11426 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
11427 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
11429 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
11430 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
11431 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11435 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11436 ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
11437 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11438 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
11443 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
11445 ; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11446 ; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
11448 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
11449 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
11454 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
11457 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11458 ; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
11459 ; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
11460 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11461 ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11462 ; AVX512-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11466 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
11467 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
11468 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
11476 ; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
11477 ; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
11478 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11480 ; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
11482 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1]
11487 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
11489 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
11493 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
11495 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11496 ; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
11497 ; AVX512-FCP-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
11498 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11499 ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
11500 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
11504 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
11507 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11508 ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
11509 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11513 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
11514 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11515 ; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
11516 ; AVX512-FCP-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
11518 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1]
11521 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7]
11525 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11526 ; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11527 ; AVX512-FCP-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
11528 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11529 ; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11530 ; AVX512-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
11532 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
11535 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
11539 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7]
11544 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
11553 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
11554 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
11559 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7]
11567 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
11572 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7]
11577 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7]
11581 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7]
11587 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7]
11594 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
11596 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7]
11598 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11599 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11600 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11602 ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
11604 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11607 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11608 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11610 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11613 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11614 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11616 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11619 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11620 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
11622 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11630 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11636 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11639 ; AVX512-FCP-NEXT: addq $1416, %rsp # imm = 0x588
11644 ; AVX512DQ: # %bb.0:
11645 ; AVX512DQ-NEXT: subq $840, %rsp # imm = 0x348
11646 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
11648 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11650 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11651 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11655 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3]
11656 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
11657 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
11659 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11661 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11662 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11666 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
11668 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
11674 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11676 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11677 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
11680 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3]
11681 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
11682 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
11684 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11686 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11687 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
11688 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
11691 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
11694 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11696 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11697 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
11700 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
11705 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11707 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11708 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7]
11709 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
11713 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
11714 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
11715 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
11716 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11718 ; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800
11721 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11723 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11725 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11726 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
11728 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
11729 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
11731 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
11733 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11735 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11736 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
11739 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7]
11746 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11747 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
11750 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
11751 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
11752 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
11756 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
11760 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7]
11762 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11764 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11765 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
11768 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7]
11773 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
11779 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
11780 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
11781 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11784 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11788 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
11791 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
11792 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
11797 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7]
11800 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
11802 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
11803 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
11807 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
11808 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
11809 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11812 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11817 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
11821 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
11822 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
11830 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7]
11831 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
11836 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
11837 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
11839 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
11840 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
11841 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
11844 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11845 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11846 ; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11847 ; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11850 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
11851 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
11853 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
11854 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7]
11856 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
11857 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11858 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11859 ; AVX512DQ-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
11861 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
11862 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
11864 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
11865 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
11866 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4]
11868 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
11872 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11873 ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11876 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
11877 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
11879 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
11880 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
11882 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
11885 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7]
11886 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
11889 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
11890 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
11891 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11892 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11893 ; AVX512DQ-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
11895 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
11896 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
11897 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
11898 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
11899 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
11900 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
11904 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7]
11905 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
11908 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
11909 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
11910 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
11912 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
11914 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm18 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
11916 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11917 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11918 ; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11919 ; AVX512DQ-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
11922 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
11923 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
11924 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
11925 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
11926 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
11927 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11928 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11929 ; AVX512DQ-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
11931 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
11932 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
11933 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
11934 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
11935 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
11936 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
11939 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11940 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11941 ; AVX512DQ-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
11944 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1]
11945 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7]
11946 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
11947 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7]
11948 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
11949 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11950 ; AVX512DQ-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
11951 ; AVX512DQ-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
11953 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
11954 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11955 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11956 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11957 ; AVX512DQ-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
11959 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
11960 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
11961 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
11962 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
11963 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
11964 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
11965 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11966 ; AVX512DQ-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11967 ; AVX512DQ-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
11971 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
11972 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
11973 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
11980 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
11981 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
11982 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
11984 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
11987 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
11988 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
11994 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
11995 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
11996 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
12000 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
12001 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12002 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
12004 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
12005 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
12009 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
12010 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
12011 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
12016 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
12017 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
12018 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
12019 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
12021 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
12022 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
12026 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
12027 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
12028 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
12030 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
12031 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
12033 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
12035 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
12036 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
12038 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
12039 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
12040 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
12044 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12045 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
12046 ; AVX512DQ-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12047 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
12051 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12052 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12053 ; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12054 ; AVX512DQ-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
12056 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
12057 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
12059 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
12060 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
12063 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
12068 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
12070 ; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
12071 ; AVX512DQ-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12076 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12077 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
12078 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
12084 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1}
12087 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2,3,4,5],ymm2[6],ymm4[7]
12088 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12089 ; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12090 ; AVX512DQ-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
12092 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
12093 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
12095 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
12098 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
12100 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
12104 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
12106 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12107 ; AVX512DQ-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
12108 ; AVX512DQ-NEXT: # ymm14 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
12109 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12110 ; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
12111 ; AVX512DQ-NEXT: # ymm15 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
12115 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
12118 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12119 ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
12120 ; AVX512DQ-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12124 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
12125 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12126 ; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12127 ; AVX512DQ-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12129 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1]
12130 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7]
12131 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
12133 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
12136 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1}
12137 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12138 ; AVX512DQ-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
12139 ; AVX512DQ-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
12140 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12141 ; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
12142 ; AVX512DQ-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12145 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
12146 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7]
12147 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
12148 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7]
12152 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7]
12157 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
12158 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
12159 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
12163 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
12164 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
12165 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7]
12170 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
12171 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
12176 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1}
12180 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
12181 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
12182 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6,7]
12187 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7]
12192 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
12193 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7]
12197 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
12198 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
12200 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
12201 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
12202 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7]
12205 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1}
12208 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
12209 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
12210 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
12212 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
12214 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12216 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12218 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12220 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12222 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
12228 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12234 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
12237 ; AVX512DQ-NEXT: addq $840, %rsp # imm = 0x348
12242 ; AVX512DQ-FCP: # %bb.0:
12243 ; AVX512DQ-FCP-NEXT: subq $872, %rsp # imm = 0x368
12244 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
12246 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12248 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12249 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
12253 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
12256 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
12258 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12260 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12261 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
12262 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
12268 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
12273 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12275 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12276 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
12279 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3]
12282 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
12284 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12286 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12287 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
12288 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
12291 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
12293 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12295 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12296 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
12301 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
12305 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12307 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12308 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
12309 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
12313 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
12314 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
12315 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12316 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12318 ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
12321 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12323 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12325 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12326 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
12329 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
12331 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
12333 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12336 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
12340 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
12344 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12346 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12347 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
12350 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3]
12352 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
12355 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12357 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
12360 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3,4,5,6,7]
12362 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12364 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12365 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
12369 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
12373 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
12379 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
12380 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
12381 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
12384 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12388 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2],xmm0[3],xmm13[4,5],xmm0[6,7]
12391 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
12392 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
12397 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
12400 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
12402 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
12403 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
12407 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
12408 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
12409 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12412 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12417 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
12421 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
12422 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
12430 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
12431 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
12436 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
12437 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
12439 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
12440 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
12441 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12444 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12445 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12446 ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12447 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12450 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
12451 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1]
12453 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7]
12454 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
12455 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12457 ; AVX512DQ-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
12459 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
12460 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
12463 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
12464 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
12466 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
12469 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12470 ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12471 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
12474 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
12475 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
12478 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
12480 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
12482 ; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12483 ; AVX512DQ-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
12484 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
12487 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
12488 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12489 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12490 ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12491 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
12493 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
12496 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1]
12497 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4]
12499 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
12503 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7]
12504 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
12508 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
12509 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
12510 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
12512 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12514 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
12516 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12517 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12518 ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12519 ; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
12521 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
12525 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7]
12526 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
12527 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12528 ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12529 ; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
12531 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
12533 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
12534 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
12535 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
12538 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12539 ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12540 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12542 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1]
12545 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7]
12546 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
12547 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12548 ; AVX512DQ-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
12549 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7]
12551 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
12552 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12553 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12554 ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12555 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12557 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
12559 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1]
12560 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4]
12561 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7]
12562 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12563 ; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12564 ; AVX512DQ-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
12568 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
12569 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
12570 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
12578 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
12583 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5]
12584 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7]
12587 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
12592 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7]
12596 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7]
12597 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7]
12601 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5]
12602 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
12607 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
12608 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5]
12609 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
12617 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
12619 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5]
12620 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7]
12625 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7]
12627 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
12628 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
12631 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
12632 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
12634 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
12635 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
12636 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12640 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12641 ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
12642 ; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12643 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
12647 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12648 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12650 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
12652 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
12653 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
12658 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
12662 ; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
12663 ; AVX512DQ-FCP-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
12664 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12665 ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
12666 ; AVX512DQ-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12670 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
12671 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
12672 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
12677 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1}
12680 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7]
12681 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12682 ; AVX512DQ-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12683 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
12685 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1]
12690 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
12692 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
12696 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
12698 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12699 ; AVX512DQ-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
12700 ; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
12701 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12702 ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
12703 ; AVX512DQ-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
12708 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
12712 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12713 ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12714 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
12718 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
12719 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12720 ; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12721 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12723 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,3,2,1]
12726 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
12729 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1}
12730 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12731 ; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12732 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
12733 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12734 ; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
12735 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
12737 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
12740 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7]
12744 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7]
12749 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6,7]
12756 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7]
12757 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
12762 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7]
12765 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm0 {%k1}
12770 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7]
12775 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7]
12783 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
12787 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7]
12790 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6,7]
12793 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm2 {%k1}
12797 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
12799 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
12801 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12803 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12805 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12807 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12814 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12820 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12823 ; AVX512DQ-FCP-NEXT: addq $872, %rsp # imm = 0x368
12828 ; AVX512BW: # %bb.0:
12829 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
12842 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
12843 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
12846 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
12847 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
12850 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
12853 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800
12856 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800
12864 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
12865 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
12868 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
12869 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
12872 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
12882 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
12883 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
12886 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
12887 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
12890 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
12893 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
12896 ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
12904 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
12905 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
12908 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
12909 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
12912 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
12922 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
12925 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
12926 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
12932 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
12933 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
12942 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
12944 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
12945 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
12948 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
12949 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
12973 ; AVX512BW-FCP: # %bb.0:
12974 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12987 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
12988 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
12991 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
12992 ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
12995 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
12998 ; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
13001 ; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
13009 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
13010 ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
13013 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
13014 ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
13017 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
13027 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
13028 ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
13031 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
13032 ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
13035 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
13038 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
13041 ; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
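; The AVX512DQ-BW and AVX512DQ-BW-FCP runs below match the same lines as the
; AVX512BW-FCP run above; only the check prefix differs.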
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800
; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
%strided.vec0 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
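; The mask above selects indices 0, 6, 12, ..., 378 (i*6 for i = 0..63), so
; %strided.vec0 is the first of six fields deinterleaved from the 384 x i16
; stride-6 load in %wide.vec.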