Lines Matching +full:0 +full:x1a8
21 ; SSE: # %bb.0:
25 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
32 ; AVX: # %bb.0:
35 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
42 ; AVX2: # %bb.0:
45 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
52 ; AVX2-FP: # %bb.0:
55 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
62 ; AVX2-FCP: # %bb.0:
65 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
72 ; AVX512: # %bb.0:
75 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
82 ; AVX512-FCP: # %bb.0:
85 ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
92 ; AVX512DQ: # %bb.0:
95 ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
102 ; AVX512DQ-FCP: # %bb.0:
105 ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
112 ; AVX512BW: # %bb.0:
115 ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
122 ; AVX512BW-FCP: # %bb.0:
125 ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
132 ; AVX512DQ-BW: # %bb.0:
135 ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
142 ; AVX512DQ-BW-FCP: # %bb.0:
145 ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
151 %strided.vec0 = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
164 ; SSE: # %bb.0:
170 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
172 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
182 ; AVX: # %bb.0:
186 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
194 ; AVX2: # %bb.0:
197 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
198 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
200 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
207 ; AVX2-FP: # %bb.0:
210 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
211 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
213 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
220 ; AVX2-FCP: # %bb.0:
223 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
224 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
226 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
233 ; AVX512: # %bb.0:
236 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
237 ; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
239 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
246 ; AVX512-FCP: # %bb.0:
247 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
258 ; AVX512DQ: # %bb.0:
261 ; AVX512DQ-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
262 ; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
264 ; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
271 ; AVX512DQ-FCP: # %bb.0:
272 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
283 ; AVX512BW: # %bb.0:
286 ; AVX512BW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
287 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
289 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
296 ; AVX512BW-FCP: # %bb.0:
297 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
308 ; AVX512DQ-BW: # %bb.0:
311 ; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
312 ; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
314 ; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
321 ; AVX512DQ-BW-FCP: # %bb.0:
322 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
332 %strided.vec0 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
344 ; SSE: # %bb.0:
354 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
356 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0]
358 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0]
360 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0]
376 ; AVX: # %bb.0:
381 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
384 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
395 ; AVX2: # %bb.0:
400 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
401 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
402 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
403 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
405 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
407 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
416 ; AVX2-FP: # %bb.0:
421 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
422 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
423 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
424 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
426 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
428 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
437 ; AVX2-FCP: # %bb.0:
442 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
443 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
444 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
445 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
447 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
449 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
458 ; AVX512: # %bb.0:
461 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
471 ; AVX512-FCP: # %bb.0:
474 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
484 ; AVX512DQ: # %bb.0:
487 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
497 ; AVX512DQ-FCP: # %bb.0:
500 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
510 ; AVX512BW: # %bb.0:
513 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
523 ; AVX512BW-FCP: # %bb.0:
526 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
536 ; AVX512DQ-BW: # %bb.0:
539 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
549 ; AVX512DQ-BW-FCP: # %bb.0:
552 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
561 %strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
573 ; SSE: # %bb.0:
590 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0]
593 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0]
596 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
599 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0]
602 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0]
605 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0]
608 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
610 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
613 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0]
630 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
635 ; AVX: # %bb.0:
642 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
645 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm6[0],ymm1[2],ymm6[2]
648 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
651 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm10[0],ymm2[2],ymm10[2]
668 ; AVX2: # %bb.0:
677 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
678 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
679 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
680 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
681 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
682 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
683 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
684 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
686 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3]
688 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
690 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
692 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
705 ; AVX2-FP: # %bb.0:
714 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
715 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
716 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
717 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
718 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
719 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
720 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
721 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
723 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3]
725 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
727 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
729 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
742 ; AVX2-FCP: # %bb.0:
751 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
752 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
753 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
754 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
755 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
756 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
757 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
758 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
760 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3]
762 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
764 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
766 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
779 ; AVX512: # %bb.0:
784 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
799 ; AVX512-FCP: # %bb.0:
804 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
819 ; AVX512DQ: # %bb.0:
824 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
839 ; AVX512DQ-FCP: # %bb.0:
844 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
859 ; AVX512BW: # %bb.0:
864 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
879 ; AVX512BW-FCP: # %bb.0:
884 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
899 ; AVX512DQ-BW: # %bb.0:
904 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
919 ; AVX512DQ-BW-FCP: # %bb.0:
924 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
938 %strided.vec0 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
950 ; SSE: # %bb.0:
968 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
969 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
971 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
973 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
976 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
978 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
979 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
981 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
983 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
984 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
986 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
988 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0]
989 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
991 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
993 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0]
994 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
996 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
998 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
999 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1001 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1004 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
1005 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1007 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1011 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1012 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1017 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
1022 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
1027 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
1032 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
1037 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1042 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
1047 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1051 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1056 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1058 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1060 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1063 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1065 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1069 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1071 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1080 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1082 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1084 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1086 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1088 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1090 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1092 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1094 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1102 ; AVX: # %bb.0:
1113 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
1114 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1117 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2]
1120 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
1123 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
1134 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm15[0],ymm4[2],ymm15[2]
1136 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
1138 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
1142 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2]
1151 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1165 ; AVX2: # %bb.0:
1181 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2]
1182 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1184 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2]
1186 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2]
1188 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
1190 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
1192 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
1194 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
1197 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
1199 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3]
1201 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3]
1203 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3]
1205 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1207 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
1209 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
1211 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1212 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
1214 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
1216 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
1218 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
1220 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
1222 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
1224 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
1226 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
1228 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
1230 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
1236 ; AVX2-FP: # %bb.0:
1252 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2]
1253 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1255 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2]
1257 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2]
1259 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
1261 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
1263 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
1265 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
1268 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
1270 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3]
1272 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3]
1274 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3]
1276 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1278 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
1280 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
1282 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1283 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
1285 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
1287 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
1289 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
1291 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
1293 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
1295 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
1297 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
1299 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
1301 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
1307 ; AVX2-FCP: # %bb.0:
1323 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2]
1324 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1326 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2]
1328 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2]
1330 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
1332 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
1334 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
1336 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
1339 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
1341 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3]
1343 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3]
1345 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3]
1347 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1349 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
1351 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
1353 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1354 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
1356 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
1358 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
1360 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
1362 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
1364 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
1366 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
1368 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
1370 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
1372 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
1378 ; AVX512: # %bb.0:
1387 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1412 ; AVX512-FCP: # %bb.0:
1421 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1446 ; AVX512DQ: # %bb.0:
1455 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1480 ; AVX512DQ-FCP: # %bb.0:
1489 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1514 ; AVX512BW: # %bb.0:
1523 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1548 ; AVX512BW-FCP: # %bb.0:
1557 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1582 ; AVX512DQ-BW: # %bb.0:
1591 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1616 ; AVX512DQ-BW-FCP: # %bb.0:
1625 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14]
1649 %strided.vec0 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
1661 ; SSE: # %bb.0:
1662 ; SSE-NEXT: subq $664, %rsp # imm = 0x298
1679 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
1680 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1682 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1684 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
1685 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1687 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1689 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
1690 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1692 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1694 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
1695 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1697 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1699 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
1700 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1702 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1704 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1705 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1707 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1709 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1710 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1712 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1715 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1716 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1718 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1722 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1723 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1725 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1729 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1730 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1732 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1736 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1737 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1739 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1743 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1744 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1746 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1750 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1751 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1753 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1757 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1758 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1760 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1764 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1765 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1767 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1771 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1772 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1774 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1778 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1779 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1781 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1785 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1786 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1788 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1792 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1793 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1795 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1799 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1800 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1802 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1806 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1807 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1809 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1813 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1814 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1816 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1820 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1821 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1827 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1828 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1830 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1834 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
1836 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1840 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
1845 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
1850 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1855 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1860 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
1865 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0]
1870 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1878 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1880 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1882 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1884 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1886 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1888 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1890 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1892 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1894 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1896 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1898 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1900 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1902 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1904 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1906 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1908 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1910 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1912 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1914 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1916 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1918 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1920 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1922 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1924 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1933 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1935 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1939 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1941 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1943 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1945 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1947 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1949 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1951 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1953 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1955 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1957 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1959 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1961 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1963 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1965 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1967 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1969 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1971 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1973 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1975 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1977 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1979 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1981 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1985 ; SSE-NEXT: addq $664, %rsp # imm = 0x298
1989 ; AVX: # %bb.0:
1990 ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8
2001 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2002 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2005 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2]
2006 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2009 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
2010 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2016 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2017 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2018 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2020 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2024 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2025 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
2028 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2032 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2033 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
2034 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2036 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2037 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
2038 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2040 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2043 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2045 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2049 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2051 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2055 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
2057 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2061 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2066 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2071 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2076 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2081 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2086 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2093 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2097 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2103 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2105 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2107 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2109 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2117 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2119 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2121 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2123 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2125 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2127 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2129 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2131 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2133 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2135 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2137 ; AVX-NEXT: addq $424, %rsp # imm = 0x1A8
2142 ; AVX2: # %bb.0:
2143 ; AVX2-NEXT: subq $424, %rsp # imm = 0x1A8
2159 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
2160 ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2162 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2163 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
2166 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2167 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
2168 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2170 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2171 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2172 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2174 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2175 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
2176 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2178 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2179 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
2181 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2182 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
2183 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2185 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2187 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
2189 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2192 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2194 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2197 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2199 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2202 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2204 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2207 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2211 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2215 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2219 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2]
2223 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
2225 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
2227 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
2229 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
2231 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
2233 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
2235 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2236 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2239 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2241 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
2243 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
2245 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
2247 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3]
2249 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2250 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2252 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2253 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2255 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2256 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2258 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2259 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2261 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3]
2263 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
2265 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
2267 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3]
2269 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2270 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2272 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2273 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2275 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2276 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2278 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2279 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2281 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2282 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2284 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2285 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2287 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2288 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2290 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2291 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2293 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2294 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2296 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2297 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2299 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2300 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
2302 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3]
2304 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
2306 ; AVX2-NEXT: addq $424, %rsp # imm = 0x1A8
2311 ; AVX2-FP: # %bb.0:
2312 ; AVX2-FP-NEXT: subq $424, %rsp # imm = 0x1A8
2328 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
2329 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2331 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2332 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
2335 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2336 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
2337 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2339 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2340 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2341 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2343 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2344 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
2345 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2347 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2348 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
2350 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2351 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
2352 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2354 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2356 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
2358 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2361 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2363 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2366 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2368 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2371 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2373 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2376 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2380 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2384 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2388 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2]
2392 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
2394 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
2396 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
2398 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
2400 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
2402 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
2404 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2405 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2408 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2410 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
2412 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
2414 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
2416 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3]
2418 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2419 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2421 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2422 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2424 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2425 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2427 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2428 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2430 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3]
2432 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
2434 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
2436 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3]
2438 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2439 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2441 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2442 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2444 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2445 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2447 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2448 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2450 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2451 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2453 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2454 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2456 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2457 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2459 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2460 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2462 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2463 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2465 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2466 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2468 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2469 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
2471 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3]
2473 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
2475 ; AVX2-FP-NEXT: addq $424, %rsp # imm = 0x1A8
2480 ; AVX2-FCP: # %bb.0:
2481 ; AVX2-FCP-NEXT: subq $424, %rsp # imm = 0x1A8
2497 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
2498 ; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2500 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2501 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
2504 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2505 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
2506 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2508 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2509 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2510 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2512 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2513 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
2514 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2516 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2517 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
2519 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2520 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
2521 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2523 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2525 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
2527 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2530 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2532 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2535 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2537 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2540 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2542 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2545 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2549 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2553 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
2557 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2]
2561 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
2563 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
2565 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
2567 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
2569 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
2571 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
2573 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2574 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2577 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2579 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
2581 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
2583 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
2585 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3]
2587 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2588 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2590 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2591 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2593 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2594 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2596 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2597 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2599 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3]
2601 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
2603 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
2605 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3]
2607 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2608 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2610 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2611 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2613 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2614 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2616 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2617 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2619 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2620 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2622 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2623 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2625 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2626 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2628 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2629 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2631 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2632 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2634 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2635 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2637 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2638 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
2640 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3]
2642 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
2644 ; AVX2-FCP-NEXT: addq $424, %rsp # imm = 0x1A8
2649 ; AVX512: # %bb.0:
2666 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
2711 ; AVX512-FCP: # %bb.0:
2728 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
2773 ; AVX512DQ: # %bb.0:
2790 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
2835 ; AVX512DQ-FCP: # %bb.0:
2852 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
2897 ; AVX512BW: # %bb.0:
2914 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
2959 ; AVX512BW-FCP: # %bb.0:
2976 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
3021 ; AVX512DQ-BW: # %bb.0:
3038 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
3083 ; AVX512DQ-BW-FCP: # %bb.0:
3100 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14]
3144 %strided.vec0 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>