; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0]
%strided.vec0 = shufflevector <12 x i64> %wide.vec, <12 x i64> poison, <2 x i32> <i32 0, i32 6>
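; Annotation (ours, not part of the generated checks): the <i32 0, i32 6> mask pulls element 0 of each stride-6 group out of the <12 x i64> wide load, i.e. the first deinterleaved result vector at VF=2.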
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0]
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0]
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm7[0]
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0]
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm8[0]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm8[0]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm6[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm8[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm4[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm6[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm8[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm4[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm6[0]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm8[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm4[0]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
%strided.vec0 = shufflevector <24 x i64> %wide.vec, <24 x i64> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
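; Annotation (ours, not part of the generated checks): the <0, 6, 12, 18> mask again gathers lane 0 of every stride-6 group, here at VF=4 over a <24 x i64> wide load.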
; SSE: # %bb.0:
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0]
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX: # %bb.0:
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm12[0]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm11[0]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm4[0],ymm12[0],ymm4[2],ymm12[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm15[0],ymm8[2],ymm15[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm12[0]
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2: # %bb.0:
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6]
; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7]
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0
%strided.vec0 = shufflevector <48 x i64> %wide.vec, <48 x i64> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
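; Annotation (ours, not part of the generated checks): the same lane-0 extraction at VF=8, taking elements 0, 6, 12, ..., 42 of the <48 x i64> wide load.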
; SSE: # %bb.0:
; SSE-NEXT: subq $408, %rsp # imm = 0x198
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: addq $408, %rsp # imm = 0x198
; AVX: # %bb.0:
; AVX-NEXT: subq $552, %rsp # imm = 0x228
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm6[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm7[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm14[0],ymm2[2],ymm14[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm15[0],ymm6[2],ymm15[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm3[0],xmm4[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7]
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm9[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm6[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: addq $552, %rsp # imm = 0x228
; AVX2: # %bb.0:
; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2]
; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2680 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2682 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2684 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2686 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2688 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2692 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2694 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2699 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2701 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2706 ; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8
2711 ; AVX2-FP: # %bb.0:
2712 ; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8
2715 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2718 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2721 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2724 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
2725 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2]
2726 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2727 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
2728 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
2729 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2732 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
2733 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
2734 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2735 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
2736 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
2737 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2740 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0]
2741 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
2743 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2744 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
2745 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7]
2746 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2749 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
2750 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
2753 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0]
2754 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2755 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2759 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
2760 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2764 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2765 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2769 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2770 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2775 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2776 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2778 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2779 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
2782 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0]
2783 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2784 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2786 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
2787 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
2790 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
2791 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
2792 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2794 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2795 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
2798 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0]
2799 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
2800 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2802 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
2805 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0]
2806 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
2807 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2811 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
2812 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2813 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2817 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
2818 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2819 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2823 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
2824 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2829 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
2830 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2831 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2832 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2833 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
2834 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
2837 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0]
2838 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2839 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2840 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2841 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
2842 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3]
2845 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0]
2846 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2847 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2848 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
2849 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
2852 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0]
2853 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2854 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
2855 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
2858 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0]
2859 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
2863 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7]
2867 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2871 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2875 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2876 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2878 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2880 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2882 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2884 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2886 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2888 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2890 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2892 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2894 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2896 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2898 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2900 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2904 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2906 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2911 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2913 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2918 ; AVX2-FP-NEXT: addq $488, %rsp # imm = 0x1E8
2923 ; AVX2-FCP: # %bb.0:
2924 ; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8
2927 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2930 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2933 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2936 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
2937 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2]
2938 ; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2939 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
2940 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
2941 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2944 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
2945 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
2946 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2947 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
2948 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
2949 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2952 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0]
2953 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
2955 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2956 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
2957 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7]
2958 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2961 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
2962 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
2965 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0]
2966 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2967 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2971 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
2972 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2976 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2977 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2981 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2982 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2987 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2988 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2990 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2991 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
2994 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0]
2995 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2996 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2998 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
2999 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
3002 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
3003 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
3004 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3006 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3007 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
3010 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0]
3011 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
3012 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3014 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
3017 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0]
3018 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
3019 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3023 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
3024 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
3025 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3029 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
3030 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
3031 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3035 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
3036 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3041 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
3042 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3043 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3044 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3045 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
3046 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
3049 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0]
3050 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3051 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3052 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3053 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
3054 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3]
3057 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0]
3058 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3059 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3060 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
3061 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
3064 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0]
3065 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3066 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
3067 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
3070 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0]
3071 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
3075 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3079 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
3083 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3087 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3088 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3090 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3092 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3094 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3096 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3098 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3100 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3102 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3104 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3106 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3108 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3110 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3112 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3116 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3118 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3123 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3125 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3130 ; AVX2-FCP-NEXT: addq $488, %rsp # imm = 0x1E8
3135 ; AVX512: # %bb.0:
3136 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3149 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
3150 ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3153 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
3160 ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3172 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
3173 ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3176 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
3181 ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3192 ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3195 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
3201 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
3202 ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3214 ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3217 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
3221 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
3222 ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3231 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
3232 ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3235 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
3236 ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
3241 ; AVX512-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
3245 ; AVX512-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
3248 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
3249 ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3251 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
3252 ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3255 ; AVX512-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
3259 ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
3278 ; AVX512-FCP: # %bb.0:
3279 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3292 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
3293 ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3296 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
3303 ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3315 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
3316 ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3319 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
3324 ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3335 ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3338 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
3344 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
3345 ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3357 ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3360 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
3364 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
3365 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3374 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
3375 ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3378 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
3379 ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
3384 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
3388 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
3391 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
3392 ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3394 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
3395 ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3398 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
3402 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
3421 ; AVX512DQ: # %bb.0:
3422 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3435 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
3436 ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3439 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
3446 ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3458 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
3459 ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3462 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
3467 ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3478 ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3481 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
3487 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
3488 ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3500 ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3503 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
3507 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
3508 ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3517 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
3518 ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3521 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
3522 ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
3527 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
3531 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
3534 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
3535 ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3537 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
3538 ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3541 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
3545 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
3564 ; AVX512DQ-FCP: # %bb.0:
3565 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3578 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
3579 ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3582 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
3589 ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3601 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
3602 ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3605 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
3610 ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3621 ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3624 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
3630 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
3631 ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3643 ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3646 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
3650 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
3651 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3660 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
3661 ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3664 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
3665 ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
3670 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
3674 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
3677 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
3678 ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3680 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
3681 ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3684 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
3688 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
3707 ; AVX512BW: # %bb.0:
3708 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3721 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
3722 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3725 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
3732 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3744 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
3745 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3748 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
3753 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3764 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3767 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
3773 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
3774 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3786 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3789 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
3793 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
3794 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3803 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
3804 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3807 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
3808 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
3813 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
3817 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
3820 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
3821 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3823 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
3824 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3827 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
3831 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
3850 ; AVX512BW-FCP: # %bb.0:
3851 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3864 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
3865 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3868 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
3875 ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3887 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
3888 ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3891 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
3896 ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3907 ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3910 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
3916 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
3917 ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3929 ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3932 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
3936 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
3937 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3946 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
3947 ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3950 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
3951 ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
3956 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
3960 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
3963 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
3964 ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
3966 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
3967 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3970 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
3974 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
3993 ; AVX512DQ-BW: # %bb.0:
3994 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4007 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
4008 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
4011 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
4018 ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4030 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
4031 ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4034 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
4039 ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4050 ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4053 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
4059 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
4060 ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4072 ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4075 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
4079 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
4080 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4089 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
4090 ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
4093 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
4094 ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
4099 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
4103 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
4106 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
4107 ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
4109 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
4110 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
4113 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
4117 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
4136 ; AVX512DQ-BW-FCP: # %bb.0:
4137 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4150 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10]
4151 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
4154 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0]
4161 ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4173 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11]
4174 ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4177 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0]
4182 ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4193 ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4196 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0]
4202 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12]
4203 ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4215 ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4218 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0]
4222 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13]
4223 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4232 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6]
4233 ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
4236 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6]
4237 ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
4242 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25
4246 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
4249 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7]
4250 ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
4252 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7]
4253 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
4256 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9
4260 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
4278 %strided.vec0 = shufflevector <96 x i64> %wide.vec, <96 x i64> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
4295 ; SSE: # %bb.0:
4296 ; SSE-NEXT: subq $1176, %rsp # imm = 0x498
4313 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
4314 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4316 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4318 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0]
4319 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4321 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4323 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
4324 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4326 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4328 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
4329 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4331 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4333 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
4334 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4336 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4338 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
4339 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4341 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4343 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4344 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4346 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4349 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4350 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4352 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4356 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4357 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4359 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4363 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4364 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4366 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4370 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4371 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4373 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4377 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4378 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4380 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4384 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4385 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4387 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4391 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4392 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4394 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4398 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4399 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4401 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4405 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4406 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4408 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4412 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4413 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4415 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4419 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4420 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4422 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4426 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4427 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4429 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4433 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4434 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4436 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4440 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4441 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4443 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4447 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4448 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4450 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4454 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4455 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4457 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4461 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4462 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4464 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4468 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4469 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4471 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4475 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4476 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4478 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4482 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4483 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4485 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4489 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4490 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4492 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4496 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4497 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4499 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4503 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4504 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4506 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4510 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4511 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4513 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4517 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4518 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4520 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4524 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4525 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4527 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4531 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4532 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4534 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4538 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4539 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4541 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4545 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4546 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4548 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4552 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4553 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4555 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4559 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4560 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4566 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4567 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4569 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4573 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4574 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4579 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4580 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4585 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4586 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4591 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
4596 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
4601 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
4606 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
4611 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
4616 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4618 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4620 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4622 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4624 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4626 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4628 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4630 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4632 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4634 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4636 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4638 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4640 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4644 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4646 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4652 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4654 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4658 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4660 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4662 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4664 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4666 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4668 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4692 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4694 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4696 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4700 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4708 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4712 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4714 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4716 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4718 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4720 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4722 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4724 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4726 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4728 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4732 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4734 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4736 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4738 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4740 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4742 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4744 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4752 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4754 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4756 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4758 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4760 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4762 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4764 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4766 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4768 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4770 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4772 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4782 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4786 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4788 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4790 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4792 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4794 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4796 ; SSE-NEXT: addq $1176, %rsp # imm = 0x498
4800 ; AVX: # %bb.0:
4801 ; AVX-NEXT: subq $1624, %rsp # imm = 0x658
4803 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4805 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4807 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4809 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4810 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
4812 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4814 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4815 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
4816 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4817 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4819 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4820 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
4822 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4824 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4825 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
4826 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4827 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4829 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4830 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4832 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4834 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4835 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4836 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4837 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4839 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4841 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4842 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
4844 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4846 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4847 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4848 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4849 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4851 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4854 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
4857 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0]
4858 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4859 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4862 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
4865 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
4866 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4867 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4870 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
4873 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
4874 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4875 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4878 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
4881 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
4882 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
4883 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4884 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4885 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4887 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4888 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4890 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
4891 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4892 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4893 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4895 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4896 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4898 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
4899 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4900 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4901 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4903 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4904 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4906 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
4907 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4908 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4909 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4911 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4912 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4914 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
4915 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4917 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4920 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
4921 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4924 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4925 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4928 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
4929 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4932 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4933 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4937 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4938 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4940 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4942 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4943 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4944 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4945 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4947 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4949 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4950 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4952 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4954 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4955 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4956 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4957 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4959 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4961 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4962 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4964 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4966 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4967 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4968 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4969 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4971 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4973 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4974 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4977 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0]
4978 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4979 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4981 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4983 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4985 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4986 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4988 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4989 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
4990 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4991 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4994 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
4997 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0]
4998 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4999 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5002 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
5005 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0]
5006 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
5007 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5010 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
5013 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
5014 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5015 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5017 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5019 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5020 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5022 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5023 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5024 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5025 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5027 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5028 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5030 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5032 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5033 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5035 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5036 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5038 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5039 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5040 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5041 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5044 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
5045 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5048 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5049 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5052 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5053 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5056 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5057 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5058 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5059 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5061 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5062 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
5064 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5065 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5067 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5069 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5071 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5072 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5074 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5075 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
5076 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5077 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5079 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5081 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5082 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5084 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5086 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5087 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
5088 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5089 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5091 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5093 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5094 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5096 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5098 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5099 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
5100 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5101 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5103 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5105 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5106 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5108 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5110 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5111 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
5112 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5113 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5115 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5117 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5118 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5121 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
5122 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5123 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5126 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
5129 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
5130 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5131 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5134 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
5137 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
5138 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5139 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5142 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
5145 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
5146 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5147 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5148 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5149 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5151 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5152 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5154 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5155 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5156 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5157 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5159 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5160 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5162 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5163 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5164 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5165 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5167 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5168 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5170 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5171 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5172 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5173 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5175 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5176 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5178 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5179 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5180 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5183 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
5186 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5189 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
5192 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
5193 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5195 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5197 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5199 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5201 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5203 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5205 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5207 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5209 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5211 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5213 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5215 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5217 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5219 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5221 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5223 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5225 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5227 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5229 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5231 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5233 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5235 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5237 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5239 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5241 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5243 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5245 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5247 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5249 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5251 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5255 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5257 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5259 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5261 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5263 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5265 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5267 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5269 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5271 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5273 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
5279 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5281 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5283 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5285 ; AVX-NEXT: addq $1624, %rsp # imm = 0x658
5290 ; AVX2: # %bb.0:
5291 ; AVX2-NEXT: subq $1496, %rsp # imm = 0x5D8
5293 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5295 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5297 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5301 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5303 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5305 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5307 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0]
5308 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
5309 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5310 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5311 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5313 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5315 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5316 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5317 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
5318 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5319 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5320 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5322 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5324 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5325 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5326 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
5327 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5328 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5329 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5331 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5333 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5334 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5335 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5337 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5339 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0]
5340 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5341 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5344 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5345 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
5346 ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5347 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5350 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0]
5351 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
5352 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5355 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5356 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
5357 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5358 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5361 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0]
5362 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
5363 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5366 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5367 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
5368 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5369 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5372 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0]
5373 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
5374 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5377 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5378 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
5379 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5380 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5383 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0]
5384 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5385 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5386 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
5389 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5391 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
5392 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5393 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5394 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5397 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5399 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
5400 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5401 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5402 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5405 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5407 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
5408 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5409 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
5412 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5414 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5415 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5419 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5420 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5424 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5425 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5429 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5430 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5434 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5435 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5437 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5438 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5441 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0]
5442 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5443 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5446 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5449 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0]
5450 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5451 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5453 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5454 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5457 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0]
5458 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5459 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5461 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5462 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5465 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0]
5466 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5467 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5469 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5470 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5472 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5474 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5475 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0]
5476 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7]
5477 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5479 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5480 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5482 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5484 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5485 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0]
5486 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
5487 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5489 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5490 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2]
5493 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0]
5494 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7]
5495 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5498 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2]
5501 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0]
5502 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
5503 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5506 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5508 ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5509 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
5510 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5511 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5516 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5517 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
5518 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5522 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5524 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5525 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
5526 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5527 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5530 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5531 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5533 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
5534 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5535 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5539 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5540 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
5541 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5542 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5547 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5548 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
5549 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5550 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5551 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5552 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5555 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5557 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5558 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
5559 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5560 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5561 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5562 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5565 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5567 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
5568 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5569 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5570 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5571 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
5572 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5574 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5576 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5577 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
5578 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5579 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5580 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5581 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
5582 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5584 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5586 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5587 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
5588 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5589 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5590 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5591 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5592 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5595 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
5596 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5597 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5598 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5599 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
5600 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5603 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
5604 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5605 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5606 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5607 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
5608 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5611 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0]
5612 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5613 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5614 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5615 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
5616 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5619 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
5620 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5621 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5622 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5623 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
5624 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5627 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0]
5628 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5629 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5630 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5631 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5632 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
5633 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5636 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
5637 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
5638 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5641 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5642 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
5644 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5645 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5647 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5649 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5650 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
5652 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5654 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5657 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5659 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5662 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5664 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5667 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
5669 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5672 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
5674 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5677 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5681 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5682 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5684 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5686 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5688 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5690 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5692 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5694 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5696 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5698 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5700 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5702 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5704 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5706 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5708 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5710 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5712 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5714 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5716 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5718 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5720 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5722 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5724 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5726 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5728 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5730 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5732 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5734 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5736 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5738 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5740 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5744 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5746 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5748 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5750 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5752 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5754 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5756 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5758 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5760 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5762 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5770 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5772 ; AVX2-NEXT: addq $1496, %rsp # imm = 0x5D8
5777 ; AVX2-FP: # %bb.0:
5778 ; AVX2-FP-NEXT: subq $1496, %rsp # imm = 0x5D8
5780 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5782 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5784 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5788 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5790 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5792 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5794 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0]
5795 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
5796 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5797 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5798 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5800 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5802 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5803 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5804 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
5805 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5806 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5807 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5809 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5811 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5812 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5813 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
5814 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
5815 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5816 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5818 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5820 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5821 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5822 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5824 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5826 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0]
5827 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5828 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5831 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5832 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
5833 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5834 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5837 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0]
5838 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
5839 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5842 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5843 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
5844 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5845 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5848 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0]
5849 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
5850 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5853 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5854 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
5855 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5856 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5859 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0]
5860 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
5861 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5864 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5865 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
5866 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5867 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
5870 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0]
5871 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5872 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5873 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
5876 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5878 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
5879 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5880 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5881 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5884 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5886 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
5887 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5888 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5889 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5892 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5894 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
5895 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5896 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
5899 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5901 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5902 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5906 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5907 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5911 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5912 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5916 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5917 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5921 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5922 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5924 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5925 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5928 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0]
5929 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5930 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5933 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5936 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0]
5937 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5938 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5940 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5941 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5944 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0]
5945 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5946 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5948 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5949 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5952 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0]
5953 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5954 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5956 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5957 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5959 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5961 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5962 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0]
5963 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7]
5964 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5966 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5967 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5969 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5971 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5972 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0]
5973 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
5974 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5976 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5977 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2]
5980 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0]
5981 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7]
5982 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5984 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5985 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2]
5988 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0]
5989 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
5990 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5993 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5995 ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5996 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
5997 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5998 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6003 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6004 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6005 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6009 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6011 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6012 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6013 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6014 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6017 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6018 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6020 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6021 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6022 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6026 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6027 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
6028 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6029 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6034 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6035 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
6036 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6037 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6038 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6039 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6042 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6044 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6045 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
6046 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6047 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6048 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6049 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6052 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6054 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
6055 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6056 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6057 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6058 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
6059 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6061 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6063 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6064 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
6065 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6066 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6067 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6068 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
6069 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6071 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6073 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6074 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
6075 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6076 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6077 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6078 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
6079 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6082 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
6083 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6084 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6085 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6086 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
6087 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6090 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
6091 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6092 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6093 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6094 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
6095 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6098 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0]
6099 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6100 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6101 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6102 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
6103 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6106 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
6107 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6108 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6109 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6110 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
6111 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6114 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0]
6115 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6116 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6117 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6118 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6119 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
6120 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6123 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
6124 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6125 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6128 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6129 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
6131 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6132 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6134 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6136 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6137 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
6139 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6141 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6144 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6146 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6149 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6151 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6154 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
6156 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6159 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6161 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6164 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6168 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6169 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6171 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6173 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6175 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6177 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6179 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6181 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6183 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6185 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6187 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6189 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6191 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6193 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6195 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6197 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6199 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6201 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6203 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6205 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6207 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6209 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6211 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6213 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6215 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6217 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6219 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6221 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6223 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6225 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6227 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6231 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6233 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6235 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6237 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6239 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6241 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6243 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6245 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6247 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6249 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6257 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6259 ; AVX2-FP-NEXT: addq $1496, %rsp # imm = 0x5D8
6264 ; AVX2-FCP: # %bb.0:
6265 ; AVX2-FCP-NEXT: subq $1496, %rsp # imm = 0x5D8
6267 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6269 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6271 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6275 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6277 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6279 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6281 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0]
6282 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
6283 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
6284 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6285 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6287 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6289 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6290 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6291 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
6292 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
6293 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6294 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6296 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6298 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6299 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6300 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
6301 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
6302 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6303 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6305 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6307 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6308 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6309 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6311 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6313 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0]
6314 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6315 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6318 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6319 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
6320 ; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6321 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6324 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0]
6325 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
6326 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6329 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6330 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
6331 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6332 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6335 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0]
6336 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
6337 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6340 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6341 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
6342 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6343 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6346 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0]
6347 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
6348 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6351 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6352 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
6353 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6354 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6357 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0]
6358 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6359 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6360 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
6363 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
6365 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
6366 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6367 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6368 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6371 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
6373 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
6374 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6375 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6376 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6379 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
6381 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
6382 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6383 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
6386 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6388 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
6389 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6393 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6394 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6398 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6399 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6403 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6404 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6408 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6409 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6411 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6412 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6415 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0]
6416 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6417 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6420 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6423 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0]
6424 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6425 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6427 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6428 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6431 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0]
6432 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6433 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6435 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6436 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6439 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0]
6440 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6441 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6443 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6444 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6446 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6448 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6449 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0]
6450 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6451 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6453 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6454 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6456 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6458 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6459 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0]
6460 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6461 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6463 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6464 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2]
6467 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0]
6468 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7]
6469 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6471 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6472 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2]
6475 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0]
6476 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
6477 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6480 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6482 ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6483 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6484 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6485 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6490 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6491 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6492 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6496 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6498 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6499 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6500 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6501 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6504 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6505 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6507 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
6508 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6509 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6513 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6514 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
6515 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6516 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6521 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6522 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
6523 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6524 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6525 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6526 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6529 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6531 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6532 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
6533 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6534 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6535 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6536 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6539 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6541 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
6542 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6543 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6544 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6545 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
6546 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6548 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6550 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6551 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
6552 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6553 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6554 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6555 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
6556 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6558 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6560 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6561 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
6562 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6563 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6564 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6565 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
6566 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6569 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
6570 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6571 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6572 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6573 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
6574 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6577 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
6578 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6579 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6580 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6581 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
6582 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6585 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0]
6586 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6587 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6588 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6589 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
6590 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6593 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
6594 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6595 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6596 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6597 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
6598 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6601 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0]
6602 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6603 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6604 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6605 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6606 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
6607 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
6610 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
6611 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6612 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6615 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6616 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
6618 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6619 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6621 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6623 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6624 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
6626 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6628 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6631 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6633 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6636 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6638 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6641 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
6643 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6646 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6648 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6651 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6655 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6656 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6658 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6660 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6662 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6664 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6666 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6668 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6670 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6672 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6674 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6676 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6678 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6680 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6682 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6684 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6686 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6688 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6690 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6692 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6694 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6696 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6698 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6700 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6702 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6704 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6706 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6708 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6710 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6712 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6714 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6718 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6720 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6722 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6724 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6726 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6728 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6730 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6732 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6734 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6736 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6744 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6746 ; AVX2-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8
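; From here the AVX512 variants switch to 512-bit zmm operations: each reserves
; a 2632-byte (0xA48) frame, builds the wide cross-lane index vectors
; ([0,6,0,10,...], [1,7,0,11,...], [12,0,0,6,...], [13,0,1,7,...],
; [0,0,6,12,...], [0,1,7,13,...], [0,10,0,6,...], [0,11,1,7,...]) with
; vbroadcasti64x4, materializes the short ymm index vectors ([0,6,12,0],
; [1,7,13,0], [10,0,6,0], [11,1,7,0]) with vpmovsxbq, and spills 64-byte zmm
; intermediates to the frame before the final vinserti32x4 $0 insertions.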
6751 ; AVX512: # %bb.0:
6752 ; AVX512-NEXT: subq $2632, %rsp # imm = 0xA48
6763 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
6764 ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
6767 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6768 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
6771 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6774 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6777 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6779 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6780 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
6781 ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
6784 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6787 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
6790 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6793 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6795 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6797 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6800 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6803 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6806 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6808 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6810 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6813 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6816 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6819 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6821 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6822 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
6823 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6826 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6827 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
6828 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
6830 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6833 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6835 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6838 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6840 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6842 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6845 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6847 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6848 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
6851 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6852 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
6855 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6857 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6860 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6862 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6864 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6872 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6875 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6878 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6881 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6887 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6892 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6897 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6906 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6914 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6923 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6935 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
6936 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
6939 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
6940 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
6943 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6944 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
6945 ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
6948 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6949 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
6950 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
6972 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6976 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6978 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6979 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
6981 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
6983 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6984 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6986 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6989 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
6991 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
6994 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6997 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6998 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
7001 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7002 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
7005 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7006 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
7012 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7017 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7018 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
7022 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7023 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
7027 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7030 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7033 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7034 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7036 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7039 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7040 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7044 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7047 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7048 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
7050 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7051 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7052 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7054 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7055 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
7057 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
7058 ; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
7060 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7061 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
7063 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
7064 ; AVX512-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
7066 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
7067 ; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
7069 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
7070 ; AVX512-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
7074 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
7093 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
7098 ; AVX512-NEXT: addq $2632, %rsp # imm = 0xA48
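; The AVX512-FCP, AVX512DQ, AVX512DQ-FCP, and AVX512BW blocks below repeat the
; same matched sequence as the AVX512 block above under their own check
; prefixes (the AVX512BW matches are cut short by the end of this listing).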
7103 ; AVX512-FCP: # %bb.0:
7104 ; AVX512-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
7115 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
7116 ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7119 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7120 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
7123 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7126 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7129 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7131 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7132 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
7133 ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7136 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7139 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
7142 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7145 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7147 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7149 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7152 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7155 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7158 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7160 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7162 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7165 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7168 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7171 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7173 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7174 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
7175 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7178 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7179 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
7180 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
7182 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7185 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7187 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7190 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7192 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7194 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7197 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7199 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7200 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
7203 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7204 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
7207 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7209 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7212 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7214 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7216 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7224 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7227 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7230 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7233 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7239 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7244 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7249 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7258 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7266 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7275 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7287 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
7288 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
7291 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
7292 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7295 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7296 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
7297 ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7300 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7301 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
7302 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
7324 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7328 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7330 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7331 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7333 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7335 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7336 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7338 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7341 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7343 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7346 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7349 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7350 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
7353 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7354 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
7357 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7358 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
7364 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7369 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7370 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
7374 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7375 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
7379 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7382 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7385 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7386 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7388 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7391 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7392 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7396 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7399 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7400 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
7402 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7403 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7404 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7406 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7407 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
7409 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
7410 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
7412 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7413 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
7415 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
7416 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
7418 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
7419 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
7421 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
7422 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
7426 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
7445 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7450 ; AVX512-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
7455 ; AVX512DQ: # %bb.0:
7456 ; AVX512DQ-NEXT: subq $2632, %rsp # imm = 0xA48
7467 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
7468 ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7471 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7472 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
7475 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7478 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7481 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7483 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7484 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
7485 ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7488 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7491 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
7494 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7497 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7499 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7501 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7504 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7507 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7510 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7512 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7514 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7517 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7520 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7523 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7525 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7526 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
7527 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7530 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7531 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
7532 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
7534 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7537 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7539 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7542 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7544 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7546 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7549 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7551 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7552 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
7555 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7556 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
7559 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7561 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7564 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7566 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7568 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7576 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7579 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7582 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7585 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7591 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7596 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7601 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7610 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7618 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7627 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7639 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
7640 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
7643 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
7644 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7647 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7648 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
7649 ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7652 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7653 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
7654 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
7676 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7680 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7682 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7683 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7685 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7687 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7688 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7690 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7693 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7695 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
7698 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7701 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7702 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
7705 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7706 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
7709 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7710 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
7716 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7721 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7722 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
7726 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7727 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
7731 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7734 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7737 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7738 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7740 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7743 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7744 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7748 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7751 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7752 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
7754 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7755 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7756 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7758 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7759 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
7761 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
7762 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
7764 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7765 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
7767 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
7768 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
7770 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
7771 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
7773 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
7774 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
7778 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
7797 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
7802 ; AVX512DQ-NEXT: addq $2632, %rsp # imm = 0xA48
7807 ; AVX512DQ-FCP: # %bb.0:
7808 ; AVX512DQ-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
7819 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
7820 ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7823 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7824 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
7827 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7830 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7833 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7835 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7836 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
7837 ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7840 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7843 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
7846 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7849 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7851 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7853 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7856 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7859 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7862 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7864 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7866 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7869 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7872 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7875 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7877 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7878 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
7879 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7882 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7883 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
7884 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
7886 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7889 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7891 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7894 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7896 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7898 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7901 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7903 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7904 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
7907 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7908 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
7911 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7913 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7916 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7918 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7920 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7928 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7931 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7934 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7937 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7943 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7948 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7953 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7962 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7970 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7979 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7991 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
7992 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
7995 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
7996 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7999 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8000 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
8001 ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8004 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8005 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
8006 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
8028 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8032 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8034 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8035 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8037 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8039 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8040 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8042 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8045 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8047 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8050 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8053 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8054 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
8057 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8058 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
8061 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8062 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
8068 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8073 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8074 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8078 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8079 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8083 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8086 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8089 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8090 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8092 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8095 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8096 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8100 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8103 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8104 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
8106 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8107 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8108 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8110 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8111 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
8113 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8114 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
8116 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8117 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
8119 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8120 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
8122 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8123 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
8125 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8126 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
8130 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
8149 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8154 ; AVX512DQ-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
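; The AVX512 variants that follow (AVX512BW, AVX512BW-FCP, AVX512DQ-BW,
; AVX512DQ-BW-FCP) repeat the shape of the block above: reserve a
; 2632-byte frame (0xA48), materialize the stride-6 index vectors with
; vbroadcasti64x4/vpmovsxbq, round-trip each 64-byte zmm intermediate
; through the frame via vmovdqu64 spills and reloads, stitch the low
; 128 bits back in with vinserti32x4, then release the frame.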
8159 ; AVX512BW: # %bb.0:
8160 ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48
8171 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
8172 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8175 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8176 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
8179 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8182 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8185 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8187 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8188 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
8189 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8192 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8195 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
8198 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8201 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8203 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8205 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8208 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8211 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8214 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8216 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8218 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8221 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8224 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8227 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8229 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8230 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
8231 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8234 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8235 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
8236 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
8238 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8241 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8243 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8246 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8248 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8250 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8253 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8255 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8256 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
8259 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8260 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
8263 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8265 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8268 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8270 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8272 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8280 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8283 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8286 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8289 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8295 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8300 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8305 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8314 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8322 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8331 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8343 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
8344 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
8347 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
8348 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8351 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8352 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
8353 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8356 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8357 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
8358 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
8380 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8384 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8386 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8387 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8389 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8391 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8392 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8394 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8397 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8399 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8402 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8405 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8406 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
8409 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8410 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
8413 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8414 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
8420 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8425 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8426 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8430 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8431 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8435 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8438 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8441 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8442 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8444 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8447 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8448 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8452 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8455 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8456 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
8458 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8459 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8460 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8462 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8463 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
8465 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8466 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
8468 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8469 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
8471 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8472 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
8474 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8475 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
8477 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8478 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
8482 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
8501 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8506 ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48
8511 ; AVX512BW-FCP: # %bb.0:
8512 ; AVX512BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
8523 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
8524 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8527 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8528 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
8531 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8534 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8537 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8539 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8540 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
8541 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8544 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8547 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
8550 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8553 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8555 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8557 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8560 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8563 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8566 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8568 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8570 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8573 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8576 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8579 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8581 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8582 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
8583 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8586 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8587 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
8588 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
8590 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8593 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8595 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8598 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8600 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8602 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8605 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8607 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8608 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
8611 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8612 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
8615 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8617 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8620 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8622 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8624 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8632 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8635 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8638 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8641 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8647 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8652 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8657 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8666 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8674 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8683 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8695 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
8696 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
8699 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
8700 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8703 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8704 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
8705 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8708 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8709 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
8710 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
8732 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8736 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8738 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8739 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8741 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8743 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8744 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8746 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8749 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8751 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
8754 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8757 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8758 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
8761 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8762 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
8765 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8766 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
8772 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8777 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8778 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8782 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8783 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8787 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8790 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8793 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8794 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8796 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8799 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8800 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8804 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8807 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8808 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
8810 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8811 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8812 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8814 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8815 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
8817 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8818 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
8820 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8821 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
8823 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8824 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
8826 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8827 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
8829 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8830 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
8834 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
8853 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8858 ; AVX512BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
8863 ; AVX512DQ-BW: # %bb.0:
8864 ; AVX512DQ-BW-NEXT: subq $2632, %rsp # imm = 0xA48
8875 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
8876 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8879 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8880 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
8883 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8886 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8889 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8891 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8892 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
8893 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8896 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8899 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
8902 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8905 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8907 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8909 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8912 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8915 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8918 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8920 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8922 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8925 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8928 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8931 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8933 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8934 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
8935 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8938 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8939 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
8940 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
8942 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8945 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8947 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8950 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8952 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8954 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8957 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8959 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8960 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
8963 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8964 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
8967 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8969 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8972 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8974 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8976 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8984 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8987 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8990 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8993 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8999 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9004 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9009 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9018 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9026 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9035 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9047 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
9048 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9051 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
9052 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9055 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9056 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
9057 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
9060 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9061 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
9062 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
9084 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9088 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9090 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9091 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9093 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9095 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9096 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9098 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9101 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9103 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9106 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9109 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9110 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
9113 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9114 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9117 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9118 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
9124 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9129 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9130 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9134 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9135 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9139 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9142 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9145 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9146 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9148 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9151 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9152 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9156 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9159 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9160 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
9162 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9163 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
9164 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9166 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9167 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
9169 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
9170 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
9172 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9173 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
9175 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
9176 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
9178 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9179 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
9181 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9182 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
9186 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9205 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
9210 ; AVX512DQ-BW-NEXT: addq $2632, %rsp # imm = 0xA48
9215 ; AVX512DQ-BW-FCP: # %bb.0:
9216 ; AVX512DQ-BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
9227 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10]
9228 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
9231 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9232 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0]
9235 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9238 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9241 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9243 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9244 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11]
9245 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
9248 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9251 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0]
9254 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9257 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9259 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9261 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9264 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9267 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9270 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9272 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9274 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9277 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9280 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9283 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9285 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9286 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6]
9287 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
9290 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9291 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7]
9292 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9294 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9297 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9299 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9302 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9304 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9306 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9309 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9311 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9312 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0]
9315 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9316 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0]
9319 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9321 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9324 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9326 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9328 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9336 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9339 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9342 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9345 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9351 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9356 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9361 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9370 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9378 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9387 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9399 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
9400 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9403 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
9404 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9407 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9408 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
9409 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
9412 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9413 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
9414 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
9436 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9440 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9442 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9443 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9445 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9447 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9448 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9450 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9453 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9455 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9458 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9461 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9462 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
9465 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9466 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9469 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9470 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
9476 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9481 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9482 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9486 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9487 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9491 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9494 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9497 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9498 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9500 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9503 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9504 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9508 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9511 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9512 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
9514 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9515 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
9516 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9518 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9519 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
9521 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
9522 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5
9524 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9525 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
9527 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
9528 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10
9530 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9531 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4
9533 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9534 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6
9538 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9557 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9562 ; AVX512DQ-BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
9566 %strided.vec0 = shufflevector <192 x i64> %wide.vec, <192 x i64> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
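A minimal sketch of what the mask above encodes: the shufflevector selects every sixth i64, i.e. field 0 of a stride-6 interleaved layout. The reduced function below is purely illustrative and is not part of the test file; it shrinks the wide vector to a hypothetical <24 x i64> so the whole mask fits on one line.

; Illustrative sketch only: deinterleave field 0 of a stride-6 layout.
define <4 x i64> @deinterleave_field0_sketch(ptr %in) {
  %wide.vec = load <24 x i64>, ptr %in, align 64
  %field0 = shufflevector <24 x i64> %wide.vec, <24 x i64> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
  ret <4 x i64> %field0
}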
9583 ; SSE: # %bb.0:
9584 ; SSE-NEXT: subq $2712, %rsp # imm = 0xA98
9601 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
9602 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9604 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9606 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
9607 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9609 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9611 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
9612 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9614 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9616 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
9617 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9619 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9621 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
9622 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9624 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9626 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
9627 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9629 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9631 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
9632 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9634 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9637 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9638 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9640 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9644 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9645 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9647 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9651 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9652 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9654 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9658 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9659 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9661 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9665 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9666 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9668 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9672 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9673 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9675 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9679 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9680 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9682 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9686 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9687 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9689 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9693 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9694 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9696 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9700 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9701 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9703 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9707 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9708 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9710 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9714 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9715 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9717 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9721 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9722 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9724 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9728 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9729 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9731 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9735 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9736 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9738 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9742 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9743 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9745 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9749 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9750 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9752 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9756 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9757 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9759 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9763 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9764 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9766 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9770 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9771 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9773 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9777 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9778 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9780 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9784 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9785 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9787 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9791 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9792 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9794 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9798 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9799 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9801 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9805 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9806 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9808 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9812 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9813 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9815 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9819 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9820 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9822 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9826 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9827 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9829 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9833 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9834 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9836 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9840 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9841 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9843 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9847 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9848 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9850 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9854 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9855 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9857 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9861 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9862 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9864 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9868 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9869 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9871 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9875 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9876 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9878 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9882 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9883 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9885 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9889 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9890 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9892 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9896 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9897 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9899 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9903 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9904 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9906 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9910 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9911 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9913 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9917 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9918 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9920 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9924 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9925 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9927 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9931 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9932 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9934 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9938 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9939 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9941 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9945 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9946 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9948 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9952 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9953 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9955 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9959 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9960 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9962 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9966 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9967 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9969 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9973 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9974 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9976 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9980 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9981 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9983 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9987 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9988 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9990 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9994 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9995 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9997 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10001 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10002 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10004 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10008 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10009 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10011 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10015 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10016 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10018 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10022 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10023 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10025 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10029 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10030 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10032 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10036 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10037 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10039 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10043 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10044 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10046 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10050 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10051 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10053 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10057 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10058 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10060 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10064 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10065 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10067 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10071 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10072 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10074 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10078 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10079 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10081 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10085 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10086 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10088 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10092 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10093 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10095 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10099 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10100 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10102 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10106 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10107 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10109 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10113 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10114 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10116 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10120 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10121 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10123 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10127 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10128 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10130 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10134 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10135 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10137 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10141 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10142 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10144 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10148 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10149 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10151 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10155 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10156 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10158 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10162 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10163 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10165 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10169 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10170 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10172 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10176 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10177 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10179 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10183 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10184 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10186 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10190 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10191 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10197 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
10198 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10203 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
10204 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10209 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
10210 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10215 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
10220 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
10225 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
10230 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
10235 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
10240 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
10242 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10244 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10246 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10248 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10250 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10252 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10254 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10256 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10258 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10260 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10262 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10264 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10266 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10268 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10270 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10272 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10274 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10276 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10278 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10280 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10282 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10284 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10286 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10288 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10290 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10292 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10294 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10296 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10298 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10300 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10302 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10304 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10306 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10308 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10310 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10312 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10314 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10316 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10318 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10320 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10322 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10324 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10326 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10328 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10330 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10332 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10334 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10336 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10338 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10340 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10342 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10344 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10346 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10348 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10350 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10352 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10354 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10356 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10358 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10360 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10362 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10364 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10366 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10368 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10370 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10372 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10374 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10376 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10378 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10380 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10382 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10384 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10390 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10392 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10394 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10396 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10398 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10400 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10402 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10404 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10406 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10408 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10410 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10412 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10414 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10416 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10418 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10420 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10422 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10424 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10426 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10428 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10430 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10432 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10434 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10436 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10438 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10440 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10442 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10444 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10446 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10448 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10450 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10452 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10454 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10456 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10458 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10460 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10462 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10464 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10466 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10468 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10470 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10472 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10474 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10476 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10478 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10480 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10482 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10484 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10486 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10488 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10490 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10492 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10494 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10496 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10504 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10506 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10508 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10510 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10512 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10514 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10516 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10518 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10520 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10522 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10524 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10526 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10528 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10530 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10532 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10534 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10536 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10538 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10540 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10542 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10544 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10546 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10548 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10550 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10552 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10554 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10556 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
10568 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10570 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10572 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10574 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10576 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10578 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10580 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10582 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10584 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10586 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10588 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10590 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10592 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10594 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10596 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10598 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10600 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10602 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10604 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10606 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10608 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10610 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10612 ; SSE-NEXT: addq $2712, %rsp # imm = 0xA98
10616 ; AVX: # %bb.0:
10617 ; AVX-NEXT: subq $3768, %rsp # imm = 0xEB8
10619 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10621 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10623 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10625 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10626 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10628 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10630 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10631 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
10632 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10633 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10635 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10636 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
10638 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10640 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10641 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
10642 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10643 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10645 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10646 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
10648 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10650 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10651 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10652 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10653 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10655 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10657 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10658 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10660 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10662 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10663 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10664 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10665 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10667 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10669 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10670 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10672 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10674 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10675 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10676 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10677 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10679 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10681 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10682 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10684 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10686 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10687 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10688 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10689 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10691 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10693 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10694 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10696 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10698 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10699 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10700 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10701 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10703 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10705 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10706 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10708 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10710 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10711 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10712 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10713 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10715 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10717 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10718 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10720 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10722 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10723 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
10724 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10725 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10727 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10729 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10730 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10732 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10734 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10735 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10736 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10737 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10739 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10741 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10742 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10744 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10746 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10747 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10748 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10749 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10751 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10753 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10754 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10756 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10758 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10759 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10760 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10761 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10763 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10765 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10766 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10769 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
10770 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10771 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10774 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
10777 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
10778 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10779 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10782 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
10785 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
10786 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10787 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10790 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
10793 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
10794 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10795 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10796 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10797 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10799 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10800 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10802 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10803 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10804 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10805 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10807 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10808 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10810 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10811 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10812 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10813 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10815 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10816 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10818 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10819 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10820 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10821 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10823 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10824 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10826 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10827 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10828 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10829 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10831 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10832 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10834 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10835 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10836 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10837 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10839 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10840 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10842 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10843 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10844 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10845 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10847 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10848 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10850 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10851 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10852 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10853 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10855 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10856 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10858 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10859 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10860 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10861 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10863 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10864 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10866 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10867 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10868 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10869 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10871 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10872 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10874 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10875 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10876 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10877 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10879 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10880 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10882 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10883 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10884 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10885 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10887 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10888 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10890 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
10891 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10892 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10893 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10896 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
10897 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10900 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
10901 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10904 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
10905 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10908 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10909 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10911 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10913 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10914 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10916 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10918 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10919 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10920 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10921 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10923 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10925 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10926 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10928 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10930 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10931 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10932 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10933 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10935 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10937 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10938 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10940 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10942 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10943 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10944 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10945 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10947 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10949 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10950 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10952 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10954 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10955 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10956 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10957 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10959 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10961 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10962 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10964 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10966 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10967 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10968 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10969 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10971 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10973 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10974 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10976 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10978 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10979 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10980 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10981 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10983 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10985 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10986 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10988 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10990 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10991 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10992 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10993 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10995 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10997 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10998 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11000 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11002 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11003 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11004 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11005 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11007 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11009 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11010 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11012 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11014 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11015 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11016 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11017 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11019 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11021 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11022 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11024 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11027 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11028 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11029 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11031 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11033 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11034 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11036 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11038 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11039 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11040 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11041 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11043 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11045 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11046 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11048 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11050 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11051 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11052 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11053 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11055 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11057 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11058 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11061 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
11062 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11063 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11066 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
11069 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0]
11070 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11071 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11074 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2]
11077 ; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0]
11078 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
11079 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11083 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
11085 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0]
11086 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7]
11087 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11090 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11091 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11092 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11093 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11095 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11096 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11098 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11099 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11102 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11103 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11104 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11105 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11107 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11108 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11110 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11111 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11114 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11115 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11116 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11117 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11119 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11120 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11122 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11123 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11124 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11125 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11128 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11129 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11130 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11131 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11133 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11134 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11136 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11137 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11138 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11139 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11141 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11142 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11144 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11145 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11146 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11147 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11149 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11150 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11152 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11153 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11154 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11155 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11157 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11158 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11160 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11161 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11162 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11163 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11165 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11166 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11168 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11169 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11170 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11171 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11174 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11176 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11177 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11178 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11179 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11181 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11182 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11184 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11185 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11186 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11187 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11189 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11190 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11192 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11193 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11194 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11195 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11197 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11198 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11200 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11201 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11203 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11205 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11207 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11208 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11210 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11211 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
11212 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11213 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11215 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11217 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11218 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11222 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11223 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11224 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11225 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11227 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11229 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11230 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11232 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11234 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11235 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11236 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11237 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11239 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11241 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11242 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11244 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11246 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11247 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11248 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11249 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11251 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11253 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11254 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11256 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11258 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11259 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11260 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11261 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11263 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11265 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11266 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11268 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11270 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11271 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11272 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11273 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11275 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11277 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11278 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11280 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11282 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11283 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11284 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11285 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11287 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11289 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11290 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11292 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11294 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11295 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11296 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11297 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11299 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11301 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11302 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11304 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11306 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11307 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11308 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11309 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11311 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11313 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11314 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11316 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11318 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11319 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11320 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11321 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11323 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11325 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11326 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11328 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11330 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11331 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11332 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11333 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11335 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11337 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11338 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11340 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11342 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11343 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
11344 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11345 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11347 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11349 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11350 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11353 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
11354 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11355 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11358 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
11361 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
11362 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11363 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11366 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
11369 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
11370 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11371 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11374 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
11377 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
11378 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11379 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11380 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11381 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11383 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11384 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
11386 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11387 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11388 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11389 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11391 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11394 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11395 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11396 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11397 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11399 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11400 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
11402 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11403 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11404 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11405 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11407 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11408 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
11410 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11411 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11412 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11413 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11415 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11416 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
11418 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11419 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11420 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11421 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11423 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11424 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
11426 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11427 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11428 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11429 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11431 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11432 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
11434 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
11435 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11436 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: addq $3768, %rsp # imm = 0xEB8
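; NOTE (editorial, not part of the autogenerated checks): in the AVX body
; above, each result row is assembled by vunpckhpd on ymm/xmm values
; reloaded from spill slots and then joined with vblendps; the 3768-byte
; frame released here (imm = 0xEB8) is the spill storage those reloads
; refer to.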
; AVX2: # %bb.0:
; AVX2-NEXT: subq $3432, %rsp # imm = 0xD68
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
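; NOTE (editorial): the AVX2 block above repeats a four-instruction unit per
; output row: vmovlhps pairs the low qwords of two xmm sources, vunpcklpd
; gathers the even 64-bit lanes of two ymm sources, vpermpd ymm[0,1,0,3]
; repositions the lanes, and vblendps joins the two 128-bit halves. This
; appears to build the low (even-element) half of one deinterleaved result
; per repetition.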
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm14[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm10[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
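; NOTE (editorial): the mirrored units above use vunpckhpd on both the
; reloaded xmm and ymm values to collect the odd (high) 64-bit lanes before
; the same vblendps join; together with the vunpcklpd units this appears to
; produce the high-element counterpart of each result row.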
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: addq $3432, %rsp # imm = 0xD68
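; NOTE (editorial): the AVX2-FP lines shown below mirror the AVX2 sequence
; above line for line, with only the check prefix differing, including the
; identical 3432-byte spill frame (imm = 0xD68).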
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $3432, %rsp # imm = 0xD68
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm14[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm10[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: addq $3432, %rsp # imm = 0xD68
13798 ; AVX2-FCP: # %bb.0:
13799 ; AVX2-FCP-NEXT: subq $3432, %rsp # imm = 0xD68
13801 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13803 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13805 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13807 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13809 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13811 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13813 ; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13815 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13816 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
13817 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
13818 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
13819 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13820 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13822 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13824 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13825 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
13826 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
13827 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
13828 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13829 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13831 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13833 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13834 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
13835 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
13836 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
13837 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13838 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13840 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13842 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13843 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
13844 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
13846 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13848 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13849 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
13850 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13851 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13853 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13855 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13856 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
13857 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
13859 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13861 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13862 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
13863 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13864 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13866 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13868 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13869 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
13870 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
13872 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13874 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13875 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
13876 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13877 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13879 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13881 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13882 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
13883 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
13885 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13887 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13888 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
13889 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13890 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm14[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm10[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
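; NOTE: the "Folded Reload" annotations mark spilled values that the register
; allocator reloads directly as a memory operand of vunpckhpd instead of
; through a separate vmovaps/vmovups, saving one instruction per use.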
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: addq $3432, %rsp # imm = 0xD68
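; NOTE (a hedged comparison of the two lowerings, added as a reader aid): the
; AVX2-FCP sequence above assembles each stride-6 result from 128/256-bit
; merges and blends, at the cost of heavy stack traffic (a 3432-byte frame).
; The AVX512 sequence below can instead rely on full 512-bit cross-lane
; permutes driven by index-vector constants, which shortens the shuffle
; chains but enlarges the spill area to 7240 bytes (0x1C48).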
; AVX512: # %bb.0:
; AVX512-NEXT: subq $7240, %rsp # imm = 0x1C48
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
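; NOTE: vbroadcasti64x4 loads one 256-bit constant and repeats it into both
; halves of the zmm destination, which is what the mem[0,1,2,3,0,1,2,3]
; annotation records. Index patterns such as [0,6,0,10,...] plausibly drive
; two-source qword permutes (vpermt2q/vpermi2q) that gather every sixth
; element; the consuming instructions are an assumption, as they are not
; shown here.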
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
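; NOTE: vpmovsxbq materializes these small index vectors compactly by
; sign-extending four byte constants from memory into four qword lanes, so
; ymm8 = [0,6,12,0] needs only a 4-byte literal instead of a 32-byte one.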
; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15410 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15411 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15413 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15415 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15416 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15417 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15420 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15421 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15422 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15425 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15426 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15427 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15430 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15431 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15432 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15435 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15436 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15439 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15444 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15445 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15447 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15449 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15450 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15451 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15453 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15455 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15456 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15457 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15459 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15461 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15462 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15463 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15465 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15467 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15468 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15469 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
15472 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15473 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
15476 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15479 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15480 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
15482 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15483 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
15484 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15486 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15487 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
15488 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15490 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15491 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
15492 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15494 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15495 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
15496 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15498 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15499 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
15500 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15502 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15503 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
15504 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15506 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15507 ; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
15509 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15510 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
15512 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15513 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
15515 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
15516 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
15518 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
15519 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
15521 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
15522 ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
15524 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
15525 ; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
15527 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
15528 ; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
15530 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
15531 ; AVX512-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
15535 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15537 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15539 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15541 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15543 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15545 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15548 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15550 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15552 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15554 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15556 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15558 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15560 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15563 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15565 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15567 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15569 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15571 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15573 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15575 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15580 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15582 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15584 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15586 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15598 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
15607 ; AVX512-NEXT: addq $7240, %rsp # imm = 0x1C48
15612 ; AVX512-FCP: # %bb.0:
15613 ; AVX512-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48
15623 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
15624 ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
15627 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15630 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15633 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15636 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15637 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
15638 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
15641 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15644 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15647 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15650 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15652 ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15655 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15658 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15661 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15664 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15666 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15669 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15672 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15675 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15678 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15679 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
15680 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
15684 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15685 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
15686 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15688 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15691 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15693 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15696 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15698 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15701 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15703 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15707 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15710 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15713 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15716 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15719 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15721 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15726 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15729 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15732 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15735 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15738 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15740 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15745 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15749 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15752 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15754 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15757 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15759 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15762 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15764 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15767 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15769 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15771 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15773 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15776 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
15779 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15780 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
15783 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15784 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
15787 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15788 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
15791 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15793 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15796 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15798 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15800 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15805 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15808 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15811 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15814 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15817 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15819 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15824 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15827 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15830 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15833 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15836 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15838 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15843 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15846 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15849 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15852 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15855 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15857 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15862 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15865 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15868 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15871 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15874 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15876 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15881 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15884 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15887 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15890 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15893 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15899 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15904 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15909 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15914 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15919 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15924 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15929 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15938 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15944 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15948 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15956 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15962 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15965 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15968 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15971 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15974 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15977 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15980 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15983 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15984 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
15985 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15988 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15989 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
15990 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15993 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15994 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
15995 ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15998 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15999 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
16000 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
16004 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16007 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16010 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16016 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16019 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16025 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16028 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16036 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16044 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16059 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16060 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16064 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16066 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16067 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16068 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16070 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16072 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16073 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16074 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16076 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16078 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16079 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16080 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16082 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16084 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16085 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16086 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16088 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16090 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16091 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16092 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16094 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16096 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16097 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16099 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16101 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16103 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16105 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16106 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16108 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16110 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16111 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16112 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16114 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16116 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16117 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16118 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16120 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16122 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16123 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16124 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16126 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16128 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16129 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16130 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16132 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16134 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16135 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16136 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16138 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16140 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16141 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16142 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16144 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16146 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16147 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16149 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16153 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16154 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16159 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16160 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16161 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16163 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16165 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16166 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16167 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16169 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16171 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16172 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16173 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16176 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16177 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16178 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16181 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16182 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16183 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16186 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16187 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16188 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16191 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16192 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16195 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16200 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16201 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16203 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16205 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16206 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16207 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16209 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16211 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16212 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16213 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16215 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16217 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16218 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16219 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16221 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16223 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16224 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16225 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
16228 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16229 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
16232 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16235 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16236 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
16238 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16239 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
16240 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16242 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16243 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
16244 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16246 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16247 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
16248 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16250 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16251 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
16252 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16254 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16255 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
16256 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16258 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16259 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
16260 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16262 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16263 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
16265 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16266 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
16268 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16269 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
16271 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
16272 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
16274 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
16275 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
16277 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
16278 ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
16280 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
16281 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
16283 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
16284 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
16286 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
16287 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
16291 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16293 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16295 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16297 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16299 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16301 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16304 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16306 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16308 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16310 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16312 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16314 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16316 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16319 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16321 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16323 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16325 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16327 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16329 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16331 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16336 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16338 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16340 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16342 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16354 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16363 ; AVX512-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48
16368 ; AVX512DQ: # %bb.0:
16369 ; AVX512DQ-NEXT: subq $7240, %rsp # imm = 0x1C48
16379 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
16380 ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
16383 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16386 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16389 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16392 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16393 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
16394 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
16397 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16400 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16403 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16406 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16408 ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16411 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16414 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16417 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16420 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16422 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16425 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16428 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16431 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16434 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16435 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
16436 ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
16440 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16441 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
16442 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
16444 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16447 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16449 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16452 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16454 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16457 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16459 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16463 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16466 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16469 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16472 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16475 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16477 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16482 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16485 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16488 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16491 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16494 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16496 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16501 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16505 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16508 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16510 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16513 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16515 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16518 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16520 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16523 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16525 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16527 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16529 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16532 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
16535 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16536 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
16539 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16540 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
16543 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16544 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
16547 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16549 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16552 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16554 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16556 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16561 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16564 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16567 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16570 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16573 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16575 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16580 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16583 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16586 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16589 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16592 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16594 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16599 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16602 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16605 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16608 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16611 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16613 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16618 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16621 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16624 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16627 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16630 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16632 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16637 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16640 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16643 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16646 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16649 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16655 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16660 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16665 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16670 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16675 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16680 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16685 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16694 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16700 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16704 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16712 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16718 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16721 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16724 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16727 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16730 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16733 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16736 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16739 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16740 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
16741 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
16744 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16745 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
16746 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
16749 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16750 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
16751 ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
16754 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16755 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
16756 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
16760 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16763 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16766 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16772 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16775 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16781 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16784 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16792 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16800 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16815 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16816 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16820 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16822 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16823 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16824 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16826 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16828 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16829 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16830 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16832 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16834 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16835 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16836 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16838 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16840 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16841 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16842 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16844 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16846 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16847 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16848 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16850 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16852 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16853 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16855 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16857 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16859 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16861 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16862 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16864 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16866 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16867 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16868 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16870 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16872 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16873 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16874 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16876 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16878 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16879 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16880 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16882 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16884 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16885 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16886 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16888 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16890 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16891 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16892 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16894 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16896 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16897 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16898 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16900 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16902 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16903 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16905 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16909 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16910 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16915 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16916 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16917 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16919 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16921 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16922 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16923 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16925 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16927 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16928 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16929 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16932 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16933 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16934 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16937 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16938 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16939 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16942 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16943 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16944 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16947 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16948 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16951 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16956 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16957 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16959 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16961 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16962 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16963 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16965 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16967 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16968 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16969 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16971 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16973 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16974 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16975 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16977 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16979 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16980 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16981 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
16984 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16985 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
16988 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16991 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16992 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
16994 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16995 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
16996 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16998 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16999 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
17000 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17002 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17003 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
17004 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17006 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17007 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
17008 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17010 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17011 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
17012 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17014 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17015 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
17016 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17018 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17019 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
17021 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17022 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
17024 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17025 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
17027 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
17028 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
17030 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17031 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
17033 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
17034 ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
17036 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
17037 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
17039 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17040 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
17042 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17043 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
17047 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17049 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17051 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17053 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17055 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17057 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17060 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17062 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17064 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17066 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17068 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17070 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17072 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17075 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17077 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17079 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17081 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17083 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17085 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17087 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17092 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17094 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17096 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17098 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17110 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
17119 ; AVX512DQ-NEXT: addq $7240, %rsp # imm = 0x1C48
17124 ; AVX512DQ-FCP: # %bb.0:
17125 ; AVX512DQ-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48
17135 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
17136 ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
17139 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17142 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17145 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17148 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17149 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
17150 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
17153 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17156 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17159 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17162 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17164 ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17167 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17170 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17173 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17176 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17178 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17181 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17184 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17187 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17190 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17191 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
17192 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
17196 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17197 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
17198 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
17200 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17203 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17205 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17208 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17210 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17213 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17215 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17219 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17222 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17225 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17228 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17231 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17233 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17238 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17241 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17244 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17247 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17250 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17252 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17257 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17261 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17264 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17266 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17269 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17271 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17274 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17276 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17279 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17281 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17283 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17285 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17288 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
17291 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17292 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
17295 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17296 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
17299 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17300 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
17303 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17305 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17308 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17310 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17312 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17317 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17320 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17323 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17326 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17329 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17331 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17336 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17339 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17342 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17345 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17348 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17350 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17355 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17358 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17361 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17364 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17367 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17369 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17374 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17377 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17380 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17383 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17386 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17388 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17393 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17396 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17399 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17402 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17405 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17411 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17416 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17421 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17426 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17431 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17436 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17441 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17450 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17456 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17460 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17468 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17474 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17477 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17480 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17483 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17486 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17489 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17492 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17495 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17496 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
17497 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
17500 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17501 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
17502 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
17505 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17506 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
17507 ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
17510 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17511 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
17512 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
17516 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17519 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17522 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17528 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17531 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17537 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17540 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17548 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17556 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17571 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17572 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17576 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17578 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17579 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17580 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17582 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17584 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17585 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17586 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17588 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17590 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17591 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17592 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17594 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17596 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17597 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17598 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17600 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17602 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17603 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17604 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17606 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17608 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17609 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17611 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17613 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17615 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17617 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17618 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17620 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17622 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17623 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17624 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17626 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17628 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17629 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17630 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17632 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17634 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17635 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17636 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17638 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17640 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17641 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17642 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17644 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17646 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17647 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17648 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17650 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17652 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17653 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17654 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17656 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17658 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17659 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17661 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17665 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17666 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17671 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17672 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17673 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17675 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17677 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17678 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17679 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17681 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17683 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17684 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17685 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17688 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17689 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17690 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17693 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17694 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17695 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17698 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17699 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17700 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17703 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17704 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17707 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17712 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17713 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17715 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17717 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17718 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17719 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17721 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17723 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17724 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17725 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17727 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17729 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17730 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17731 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17733 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17735 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17736 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17737 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
17740 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17741 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
17744 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17747 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17748 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
17750 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17751 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
17752 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17754 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17755 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
17756 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17758 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17759 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
17760 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17762 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17763 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
17764 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17766 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17767 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
17768 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17770 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17771 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
17772 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17774 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17775 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
17777 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17778 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
17780 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17781 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
17783 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
17784 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
17786 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17787 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
17789 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
17790 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
17792 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
17793 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
17795 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17796 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
17798 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17799 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
17803 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17805 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17807 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17809 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17811 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17813 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17816 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17818 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17820 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17822 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17824 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17826 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17828 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17831 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17833 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17835 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17837 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17839 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17841 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17843 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17848 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17850 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17852 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17854 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17866 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17875 ; AVX512DQ-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48
17880 ; AVX512BW: # %bb.0:
17881 ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48
17891 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
17892 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
17895 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17898 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17901 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17904 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17905 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
17906 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
17909 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17912 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17915 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17918 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17920 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17923 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17926 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17929 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17932 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17934 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17937 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17940 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17943 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17946 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17947 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
17948 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
17952 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17953 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
17954 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
17956 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17959 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17961 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17964 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17966 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17969 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17971 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17975 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17978 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17981 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17984 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17987 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17989 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17994 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17997 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18000 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18003 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18006 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18008 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18013 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18017 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18020 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18022 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18025 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18027 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18030 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18032 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18035 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18037 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18039 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18041 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18044 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
18047 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18048 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
18051 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18052 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
18055 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18056 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
18059 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18061 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18064 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18066 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18068 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18073 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18076 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18079 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18082 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18085 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18087 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18092 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18095 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18098 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18101 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18104 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18106 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18111 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18114 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18117 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18120 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18123 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18125 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18130 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18133 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18136 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18139 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18142 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18144 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18149 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18152 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18155 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18158 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18161 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18167 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18172 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18177 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18182 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18187 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18192 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18197 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18206 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18212 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18216 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18224 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18230 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18233 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18236 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18239 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18242 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18245 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18248 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18251 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18252 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
18253 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
18256 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18257 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
18258 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
18261 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18262 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
18263 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
18266 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18267 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
18268 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
18272 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18275 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18278 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18284 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18287 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18293 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18296 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18304 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18312 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18327 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18328 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18332 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18334 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18335 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18336 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18338 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18340 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18341 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18342 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18344 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18346 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18347 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18348 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18350 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18352 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18353 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18354 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18356 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18358 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18359 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18360 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18362 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18364 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18365 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18367 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18369 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18371 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18373 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18374 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18376 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18378 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18379 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18380 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18382 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18384 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18385 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18386 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18388 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18390 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18391 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18392 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18394 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18396 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18397 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18398 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18400 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18402 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18403 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18404 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18406 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18408 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18409 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18410 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18412 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18414 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18415 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18417 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18421 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18422 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18427 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18428 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18429 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18431 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18433 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18434 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18435 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18437 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18439 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18440 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18441 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18444 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18445 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18446 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18449 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18450 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18451 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18454 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18455 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18456 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18459 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18460 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18463 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18468 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18469 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18471 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18473 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18474 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18475 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18477 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18479 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18480 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18481 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18483 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18485 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18486 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18487 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18489 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18491 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18492 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18493 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
18496 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18497 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
18500 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18503 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18504 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
18506 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18507 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
18508 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18510 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18511 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
18512 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18514 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18515 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
18516 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18518 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18519 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
18520 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18522 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18523 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
18524 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18526 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18527 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
18528 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18530 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18531 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
18533 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18534 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
18536 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18537 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
18539 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
18540 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
18542 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
18543 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
18545 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
18546 ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
18548 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
18549 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
18551 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
18552 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
18554 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18555 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
18559 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18561 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18563 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18565 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18567 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18569 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18572 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18574 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18576 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18578 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18580 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18582 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18584 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18587 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18589 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18591 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18593 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18595 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18597 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18599 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18604 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18606 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18608 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18610 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18622 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
18631 ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48
18636 ; AVX512BW-FCP: # %bb.0:
18637 ; AVX512BW-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48
18647 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
18648 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
18651 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18654 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18657 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18660 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18661 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
18662 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
18665 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18668 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18671 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18674 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18676 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18679 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18682 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18685 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18688 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18690 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18693 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18696 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18699 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18702 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18703 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
18704 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
18708 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18709 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
18710 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
18712 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18715 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18717 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18720 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18722 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18725 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18727 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18731 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18734 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18737 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18740 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18743 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18745 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18750 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18753 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18756 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18759 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18762 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18764 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18769 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18773 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18776 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18778 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18781 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18783 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18786 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18788 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18791 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18793 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18795 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18797 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18800 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
18803 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18804 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
18807 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18808 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
18811 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18812 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
18815 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18817 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18820 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18822 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18824 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18829 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18832 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18835 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18838 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18841 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18843 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18848 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18851 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18854 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18857 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18860 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18862 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18867 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18870 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18873 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18876 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18879 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18881 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18886 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18889 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18892 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18895 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18898 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18900 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18905 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18908 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18911 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18914 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18917 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18923 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18928 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18933 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18938 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18943 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18948 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18953 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18962 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18968 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18972 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18980 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18986 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18989 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18992 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18995 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18998 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19001 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19004 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19007 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19008 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
19009 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19012 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19013 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
19014 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
19017 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19018 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
19019 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
19022 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19023 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
19024 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
19028 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19031 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19034 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19040 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19043 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19049 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19052 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19060 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19068 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19083 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19084 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19088 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19090 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19091 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19092 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19094 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19096 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19097 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19098 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19100 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19102 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19103 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19104 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19106 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19108 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19109 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19110 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19112 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19114 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19115 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19116 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19118 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19120 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19121 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19123 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19125 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19127 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19129 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19130 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19132 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19134 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19135 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19136 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19138 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19140 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19141 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19142 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19144 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19146 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19147 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19148 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19150 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19152 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19153 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19154 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19156 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19158 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19159 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19160 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19162 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19164 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19165 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19166 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19168 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19170 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19171 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19173 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19177 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19178 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19183 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19184 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19185 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19187 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19189 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19190 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19191 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19193 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19195 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19196 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19197 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19200 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19201 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19202 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19205 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19206 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19207 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19210 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19211 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19212 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19215 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19216 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19219 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19224 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19225 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19227 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19229 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19230 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19231 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19233 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19235 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19236 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19237 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19239 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19241 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19242 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19243 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19245 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19247 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19248 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19249 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
19252 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19253 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
19256 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19259 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19260 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
19262 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19263 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
19264 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19266 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19267 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
19268 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19270 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19271 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
19272 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19274 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19275 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
19276 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19278 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19279 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
19280 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19282 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19283 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
19284 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19286 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19287 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
19289 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19290 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
19292 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19293 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
19295 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19296 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
19298 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19299 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
19301 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
19302 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
19304 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
19305 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
19307 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19308 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
19310 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19311 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
19315 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19317 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19319 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19321 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19323 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19325 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19328 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19330 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19332 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19334 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19336 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19338 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19340 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19343 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19345 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19347 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19349 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19351 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19353 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19355 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19360 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19362 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19364 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19366 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19378 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
19387 ; AVX512BW-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48
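
The [0,6,12,0] and [1,7,13,0] index vectors fed to vpmovsxbq above give away what every variant here computes: it gathers every sixth i64 element of a wide interleaved load, starting at offset 0, offset 1, and so on for the remaining lanes. A minimal IR sketch of that stride-6 deinterleave pattern, shown for two lanes only (the function name, vector width, and signature are illustrative, not the actual test input):

define void @deinterleave_stride6_sketch(ptr %in, ptr %out0, ptr %out1) {
  ; load 4 groups of 6 interleaved i64 elements as one wide vector
  %wide.vec = load <24 x i64>, ptr %in, align 64
  ; lane 0 of each group: elements 0, 6, 12, 18
  %vec0 = shufflevector <24 x i64> %wide.vec, <24 x i64> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
  ; lane 1 of each group: elements 1, 7, 13, 19
  %vec1 = shufflevector <24 x i64> %wide.vec, <24 x i64> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>
  store <4 x i64> %vec0, ptr %out0, align 32
  store <4 x i64> %vec1, ptr %out1, align 32
  ret void
}

The actual inputs checked above are wide enough that the six deinterleaved results cannot all stay live in registers at once, which is presumably where the long runs of 64-byte Spill/Reload traffic come from.
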
19392 ; AVX512DQ-BW: # %bb.0:
19393 ; AVX512DQ-BW-NEXT: subq $7240, %rsp # imm = 0x1C48
19403 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
19404 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
19407 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19410 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19413 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19416 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19417 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
19418 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
19421 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19424 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19427 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19430 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19432 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19435 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19438 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19441 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19444 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19446 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19449 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19452 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19455 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19458 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19459 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
19460 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
19464 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19465 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
19466 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
19468 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19471 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19473 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19476 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19478 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19481 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19483 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19487 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19490 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19493 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19496 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19499 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19501 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19506 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19509 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19512 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19515 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19518 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19520 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19525 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19529 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19532 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19534 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19537 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19539 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19542 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19544 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19547 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19549 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19551 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19553 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19556 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
19559 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19560 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
19563 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19564 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
19567 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19568 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
19571 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19573 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19576 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19578 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19580 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19585 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19588 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19591 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19594 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19597 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19599 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19604 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19607 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19610 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19613 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19616 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19618 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19623 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19626 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19629 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19632 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19635 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19637 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19642 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19645 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19648 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19651 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19654 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19656 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19661 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19664 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19667 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19670 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19673 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19679 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19684 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19689 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19694 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19699 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19704 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19709 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19718 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19724 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19728 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19736 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19742 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19745 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19748 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19751 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19754 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19757 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19760 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19763 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19764 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
19765 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19768 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19769 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
19770 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
19773 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19774 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
19775 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
19778 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19779 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
19780 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
19784 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19787 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19790 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19796 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19799 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19805 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19808 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19816 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19824 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19839 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19840 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19844 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19846 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19847 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19848 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19850 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19852 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19853 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19854 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19856 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19858 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19859 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19860 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19862 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19864 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19865 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19866 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19868 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19870 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19871 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19872 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19874 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19876 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19877 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19879 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19881 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19883 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19885 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19886 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19888 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19890 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19891 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19892 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19894 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19896 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19897 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19898 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19900 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19902 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19903 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19904 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19906 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19908 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19909 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19910 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19912 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19914 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19915 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19916 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19918 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19920 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19921 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19922 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19924 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19926 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19927 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19929 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19933 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19934 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19939 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19940 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19941 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19943 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19945 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19946 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19947 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19949 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19951 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19952 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19953 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19956 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19957 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19958 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19961 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19962 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19963 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19966 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19967 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19968 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19971 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19972 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19975 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19980 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19981 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19983 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19985 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19986 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19987 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19989 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19991 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19992 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19993 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19995 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19997 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19998 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19999 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20001 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20003 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20004 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20005 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
20008 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20009 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
20012 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20015 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20016 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
20018 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20019 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
20020 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20022 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20023 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
20024 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20026 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20027 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
20028 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20030 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20031 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
20032 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20034 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20035 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
20036 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20038 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20039 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
20040 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20042 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20043 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
20045 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20046 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
20048 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20049 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
20051 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20052 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
20054 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20055 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
20057 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
20058 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
20060 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
20061 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
20063 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20064 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
20066 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20067 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
20071 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20073 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20075 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20077 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20079 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20081 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20084 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20086 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20088 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20090 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20092 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20094 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20096 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20099 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20101 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20103 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20105 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20107 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20109 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20111 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20116 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20118 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20120 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20122 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20134 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
20143 ; AVX512DQ-BW-NEXT: addq $7240, %rsp # imm = 0x1C48
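
A reading aid for the check syntax that dominates this listing: anything inside {{...}} is a FileCheck regular expression, so one assertion line stands for a whole family of concrete instructions. For instance, a reload pattern like

  vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload

accepts both of the following (the offsets here are made up for illustration):

  vmovdqu64 -1928(%rsp), %zmm0 # 64-byte Reload
  vmovdqu64 -64(%rbp), %zmm0 # 64-byte Reload

since {{[-0-9]+}} matches any signed decimal offset and %r{{[sb]}}p matches either %rsp or %rbp.
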
20148 ; AVX512DQ-BW-FCP: # %bb.0:
20149 ; AVX512DQ-BW-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48
20159 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
20160 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
20163 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20166 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20169 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20172 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20173 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
20174 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
20177 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20180 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20183 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20186 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20188 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20191 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20194 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20197 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20200 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20202 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20205 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20208 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20211 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20214 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20215 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
20216 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
20220 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20221 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
20222 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
20224 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20227 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20229 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20232 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20234 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20237 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20239 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20243 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20246 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20249 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20252 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20255 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20257 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20262 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20265 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20268 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20271 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20274 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20276 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20281 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20285 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20288 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20290 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20293 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20295 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20298 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20300 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20303 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20305 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20307 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20309 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20312 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0]
20315 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20316 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0]
20319 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20320 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0]
20323 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20324 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0]
20327 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20329 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20332 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20334 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20336 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20341 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20344 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20347 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20350 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20353 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20355 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20360 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20363 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20366 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20369 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20372 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20374 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20379 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20382 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20385 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20388 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20391 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20393 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20398 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20401 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20404 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20407 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20410 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20412 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20417 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20420 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20423 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20426 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20429 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20435 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20440 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20445 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20450 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20455 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20460 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20465 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20474 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20480 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20484 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20492 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20498 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20501 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20504 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20507 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20510 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20513 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20516 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20519 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20520 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12]
20521 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
20524 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20525 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13]
20526 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
20529 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20530 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6]
20531 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
20534 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20535 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7]
20536 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
20540 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20543 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20546 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20552 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20555 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20561 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20564 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20572 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20580 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20595 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20596 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20600 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20602 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20603 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20604 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20606 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20608 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20609 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20610 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20612 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20614 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20615 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20616 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20618 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20620 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20621 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20622 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20624 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20626 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20627 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20628 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20630 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20632 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20633 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20635 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20637 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20639 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20641 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20642 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20644 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20646 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20647 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20648 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20650 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20652 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20653 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20654 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20656 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20658 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20659 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20660 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20662 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20664 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20665 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20666 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20668 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20670 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20671 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20672 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20674 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20676 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20677 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20678 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20680 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20682 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20683 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20685 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20689 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20690 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20695 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20696 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20697 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20699 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20701 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20702 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20703 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20705 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20707 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20708 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20709 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20712 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20713 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20714 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20717 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20718 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20719 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20722 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20723 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20724 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20727 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20728 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20731 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20736 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20737 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20739 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20741 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20742 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20743 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20745 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20747 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20748 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20749 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20751 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20753 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20754 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20755 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20757 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20759 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20760 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20761 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
20764 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20765 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
20768 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20771 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20772 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
20774 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20775 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
20776 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20778 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20779 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload
20780 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20782 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20783 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload
20784 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20786 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20787 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload
20788 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20790 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20791 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload
20792 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20794 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20795 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload
20796 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20798 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20799 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4
20801 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20802 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
20804 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20805 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload
20807 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20808 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload
20810 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20811 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload
20813 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
20814 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload
20816 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
20817 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19
20819 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20820 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12
20822 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20823 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14
20827 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20829 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20831 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20833 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20835 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20837 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20840 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20842 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20844 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20846 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20848 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20850 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20852 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20855 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20857 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20859 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20861 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20863 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20865 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20867 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20872 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20874 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20876 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20878 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20890 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
20899 ; AVX512DQ-BW-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48
20903 %strided.vec0 = shufflevector <384 x i64> %wide.vec, <384 x i64> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
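; A minimal standalone sketch of the pattern this test exercises (the function
; @deinterleave_lane0 below is illustrative only, not part of the test): a
; stride-6 deinterleave extracts lane N from the wide load with a single
; shufflevector whose mask entries are 6*i + N. The <384 x i64> case above is
; the same mask shape for lane 0 (0, 6, 12, ..., 378), just at 64 result
; elements instead of 4.
define <4 x i64> @deinterleave_lane0(ptr %p) {
  ; Load four interleaved stride-6 groups (4 x 6 = 24 elements).
  %wide = load <24 x i64>, ptr %p, align 8
  ; Keep only lane 0 of each group: indices 6*i for i = 0..3.
  %lane0 = shufflevector <24 x i64> %wide, <24 x i64> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
  ret <4 x i64> %lane0
}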