; Matched lines from an LLVM x86 FileCheck codegen test for stride-2
; interleaved stores of i64 vectors (SSE/AVX/AVX2/AVX2-FP/AVX2-FCP/AVX512
; check prefixes). The excerpt is discontiguous: only the lines that matched
; the search survive, so unmatched instructions between them are elided.

; SSE:       # %bb.0:
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512: # %bb.0:
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
%1 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%interleaved.vec = shufflevector <4 x i64> %1, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
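;
; vf2: writing %in.vec0 = <a0,a1> and %in.vec1 = <b0,b1>, %1 is <a0,a1,b0,b1>
; and the [0,2,1,3] mask turns it into <a0,b0,a1,b1>: the two inputs
; interleaved element by element. On AVX2 and later the whole pattern folds
; into the single cross-lane vpermpd checked above; plain AVX has no
; cross-lane 64-bit permute, so it combines vbroadcastf128 loads with vshufpd
; instead.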
; SSE: # %bb.0:
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX: # %bb.0:
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
%1 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %1, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
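;
; vf4: with %in.vec0 = <a0..a3> and %in.vec1 = <b0..b3>, the mask
; [0,4,1,5,2,6,3,7] yields <a0,b0,a1,b1,a2,b2,a3,b3>. The AVX512
; configurations materialize exactly that mask as an index vector (the
; vpmovsxbq above sign-extends a compact byte constant into zmm1) for a single
; two-source 512-bit permute; the permute instruction itself is not among the
; matched lines. AVX2 instead builds each 256-bit output half from two
; vpermpd shuffles plus a vblendps, as checked above.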
; SSE: # %bb.0:
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; AVX: # %bb.0:
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3]
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
%1 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x i64> %1, <16 x i64> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
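;
; vf8: the interleave mask pairs element i of %in.vec0 with element i of
; %in.vec1, i.e. lanes [0,8], [1,9], ... [7,15] of the concatenation.
; Treating the two original <8 x i64> inputs as the two sources of a 512-bit
; permute, the low half of the result needs indices [0,8,1,9,2,10,3,11],
; exactly the constant the vpmovsxbq lines above expand; the high half needs
; the same pattern offset by 4, which falls outside the matched lines.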
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0]
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0]
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX: # %bb.0:
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[3],ymm5[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[3],ymm6[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3]
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
%1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%interleaved.vec = shufflevector <32 x i64> %1, <32 x i64> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
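;
; vf16: the same per-pair recipe as vf8, applied across four times the data.
; At this width the SSE path exhausts the sixteen xmm registers, so the
; movaps lines above spill intermediates to the stack and reload them before
; the stores; the FileCheck regexes {{[-0-9]+}}(%r{{[sb]}}p) intentionally
; leave the slot offsets and the rsp-vs-rbp choice unconstrained.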
; SSE: # %bb.0:
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX: # %bb.0:
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[3],ymm11[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[3],ymm12[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[3],ymm13[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
%1 = shufflevector <32 x i64> %in.vec0, <32 x i64> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%interleaved.vec = shufflevector <64 x i64> %1, <64 x i64> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
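;
; vf32: the 64-element mask keeps the invariant out[2*i] = vec0[i],
; out[2*i+1] = vec1[i], so the AVX512 paths above materialize the same
; [0,8,1,9,2,10,3,11] index pattern as vf8, now in zmm10. The vf64 body that
; follows is the first in this excerpt to carry explicit frame-size
; assertions: each subq/addq pair is checked together with its imm comment
; (0x298 = 664 bytes for SSE, 0x1A8 = 424 for AVX, 0x1C8 = 456 for the AVX2
; variants), pinning each prologue to its matching epilogue.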
; SSE: # %bb.0:
; SSE-NEXT: subq $664, %rsp # imm = 0x298
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: addq $664, %rsp # imm = 0x298
; AVX: # %bb.0:
; AVX-NEXT: subq $424, %rsp # imm = 0x1A8
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm3[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[3],ymm11[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[3],ymm12[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[3],ymm13[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: addq $424, %rsp # imm = 0x1A8
; AVX2: # %bb.0:
; AVX2-NEXT: subq $456, %rsp # imm = 0x1C8
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
2706 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
2707 ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2708 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
2710 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
2711 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2712 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
2713 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
2714 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
2715 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2716 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
2718 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2719 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2720 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
2721 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
2722 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2723 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2724 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
2726 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
2727 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2728 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
2729 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
2730 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
2731 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2732 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
2734 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
2735 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2736 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
2737 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
2738 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2739 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2740 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
2742 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
2743 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2744 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
2745 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
2746 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
2747 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2748 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
2750 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
2751 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2752 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
2753 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
2754 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
2755 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2757 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
2759 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2762 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
2763 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
2764 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2765 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2766 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2768 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2769 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2772 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2773 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2774 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2775 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2776 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2778 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2779 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2782 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2783 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2784 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2785 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2787 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2790 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2791 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2792 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2793 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2795 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2798 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2799 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
2800 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
2801 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2803 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2806 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
2807 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
2808 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
2809 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2811 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
2814 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
2815 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
2816 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
2817 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
2819 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
2822 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
2823 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
2824 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
2825 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
2827 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
2830 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
2831 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
2832 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
2833 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
2835 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
2850 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2852 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2854 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2856 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2860 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2862 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2864 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2866 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2868 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2870 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2872 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2874 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2876 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2878 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2880 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2882 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2884 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2886 ; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8
2891 ; AVX512: # %bb.0:
2911 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
2953 ; AVX512-FCP: # %bb.0:
2973 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3015 ; AVX512DQ: # %bb.0:
3035 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3077 ; AVX512DQ-FCP: # %bb.0:
3097 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3139 ; AVX512BW: # %bb.0:
3159 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3201 ; AVX512BW-FCP: # %bb.0:
3221 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3263 ; AVX512DQ-BW: # %bb.0:
3283 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3325 ; AVX512DQ-BW-FCP: # %bb.0:
3345 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3387 %1 = shufflevector <64 x i64> %in.vec0, <64 x i64> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
3388 %interleaved.vec = shufflevector <128 x i64> %1, <128 x i64> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>