; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0]
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX-NEXT: vpextrw $0, %xmm5, (%r9)
; AVX-NEXT: vpextrw $0, %xmm6, (%r11)
; AVX-NEXT: vpextrw $0, %xmm7, (%r10)
; AVX-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX2-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX2-NEXT: vpextrw $0, %xmm5, (%r9)
; AVX2-NEXT: vpextrw $0, %xmm6, (%r11)
; AVX2-NEXT: vpextrw $0, %xmm7, (%r10)
; AVX2-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9)
; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%r11)
; AVX2-FP-NEXT: vpextrw $0, %xmm7, (%r10)
; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9)
; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%r11)
; AVX2-FCP-NEXT: vpextrw $0, %xmm7, (%r10)
; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512-FCP-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512DQ-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512BW-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512BW-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512BW-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512BW-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512BW-FCP-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512DQ-BW-NEXT: vpextrw $0, %xmm7, (%rax)
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r9)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r11)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm7, (%rax)
%strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 0, i32 8>
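; For context, a minimal self-contained sketch (function and value names are
; assumed, not copied from the original test) of the vf2 pattern the checks
; above exercise: one wide <16 x i8> load deinterleaved by eight stride-8
; shufflevectors into <2 x i8> lanes, one store per out-pointer. This is why
; every prefix above ends in eight 16-bit extract-stores, one each to %rsi,
; %rdx, %rcx, %r8, %r9, %r11, %r10 and %rax.
define void @load_i8_stride8_vf2_sketch(ptr %in, ptr %out0, ptr %out1, ptr %out2, ptr %out3, ptr %out4, ptr %out5, ptr %out6, ptr %out7) {
  %wide.vec = load <16 x i8>, ptr %in, align 64
  ; lane k gathers elements k and k+8 of the interleaved load
  %v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 0, i32 8>
  %v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 1, i32 9>
  %v2 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 2, i32 10>
  %v3 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 3, i32 11>
  %v4 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 4, i32 12>
  %v5 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 5, i32 13>
  %v6 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 6, i32 14>
  %v7 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %v0, ptr %out0
  store <2 x i8> %v1, ptr %out1
  store <2 x i8> %v2, ptr %out2
  store <2 x i8> %v3, ptr %out3
  store <2 x i8> %v4, ptr %out4
  store <2 x i8> %v5, ptr %out5
  store <2 x i8> %v6, ptr %out6
  store <2 x i8> %v7, ptr %out7
  ret void
}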
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
%strided.vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
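; The vf4 variant scales the same sketch (again with assumed names): the wide
; load widens to <32 x i8> and each of the eight strided lanes becomes a
; <4 x i8> whose shuffle mask strides by 8, e.g. lane 1 below.
define <4 x i8> @load_i8_stride8_vf4_lane1_sketch(ptr %in) {
  %wide.vec = load <32 x i8>, ptr %in, align 64
  ; lane 1 gathers elements 1, 9, 17, 25 of the interleaved load
  %lane1 = shufflevector <32 x i8> %wide.vec, <32 x i8> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  ret <4 x i8> %lane1
}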
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,3,2,3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3]
; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
%strided.vec0 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
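; At vf8 the wide load is <64 x i8> and lane k uses the mask
; <k, k+8, k+16, ..., k+56>. A hedged one-lane sketch with assumed names:
define <8 x i8> @load_i8_stride8_vf8_lane0_sketch(ptr %in) {
  %wide.vec = load <64 x i8>, ptr %in, align 64
  ; lane 0 gathers every eighth byte starting at index 0
  %lane0 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  ret <8 x i8> %lane0
}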
; SSE: # %bb.0:
; SSE-NEXT: subq $328, %rsp # imm = 0x148
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3]
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,5]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,5]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,2,2,3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,3,3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT: # xmm8 = mem[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7]
; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,1,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,7,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,3,1,4,5,6,7]
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,3,3]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: addq $328, %rsp # imm = 0x148
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm10 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm9[4,5,6,7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4,5,6,7]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2496 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
2497 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3]
2498 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2502 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2506 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2507 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
2511 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2516 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2517 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
2518 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2522 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2526 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2527 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
2531 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2535 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2536 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3]
2537 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
2541 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2545 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2546 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
2550 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2554 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2555 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
2556 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
2560 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2564 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2565 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
2569 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2573 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2574 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
2575 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
2579 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2583 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2584 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3]
2588 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2592 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2593 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
2594 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2598 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
2602 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2603 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
2607 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2611 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2612 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
2613 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2614 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2616 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2621 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2623 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2625 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2630 ; AVX2-FCP: # %bb.0:
2635 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
2637 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
2640 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
2642 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[3]
2644 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
2646 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
2647 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
2648 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2,3]
2649 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2652 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
2655 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
2656 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
2659 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
2661 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
2663 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
2664 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm9[2,3]
2669 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm6[3]
2672 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
2674 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
2675 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2681 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3]
2684 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
2685 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
2686 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3]
2687 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3]
2690 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
2693 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3]
2694 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
2697 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
2700 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3]
2701 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3]
2704 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
2707 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
2708 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
2709 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2715 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2717 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2719 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2725 ; AVX512: # %bb.0:
2726 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2727 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
2728 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
2729 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
2734 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2735 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
2740 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
2741 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
2744 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
2745 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
2748 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
2749 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
2752 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2753 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
2756 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
2757 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
2760 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2761 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
2764 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
2765 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
2768 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
2769 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
2772 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
2773 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
2776 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2777 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
2780 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
2781 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
2784 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2785 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
2788 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2789 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
2792 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
2793 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
2796 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2797 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
2800 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2801 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
2804 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2805 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
2808 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2809 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
2812 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2813 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
2816 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
2817 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
2820 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2821 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
2824 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2825 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2828 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2841 ; AVX512-FCP: # %bb.0:
2842 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2843 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2844 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2847 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
2851 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
2853 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2856 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2861 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
2864 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2869 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
2872 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
2877 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
2880 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
2886 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
2889 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2892 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
2895 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
2898 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
2901 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
2904 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
2907 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
2920 ; AVX512DQ: # %bb.0:
2921 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2922 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
2923 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
2924 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
2929 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2930 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
2935 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
2936 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
2939 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
2940 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
2943 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
2944 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
2947 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2948 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
2951 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
2952 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
2955 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2956 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
2959 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
2960 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
2963 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
2964 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
2967 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
2968 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
2971 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2972 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
2975 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
2976 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
2979 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2980 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
2983 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2984 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
2987 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
2988 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
2991 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2992 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
2995 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
2996 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
2999 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
3000 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
3003 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3004 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
3007 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
3008 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
3011 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
3012 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
3015 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3016 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
3019 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3020 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3023 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3036 ; AVX512DQ-FCP: # %bb.0:
3037 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3038 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3039 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3042 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
3046 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
3048 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3051 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3056 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
3059 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3064 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
3067 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
3072 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
3075 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
3081 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
3084 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
3087 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
3090 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3093 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
3096 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
3099 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
3102 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
3115 ; AVX512BW: # %bb.0:
3116 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3117 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3118 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3119 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
3124 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3125 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
3130 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3131 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
3134 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
3138 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
3142 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
3143 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
3146 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
3150 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
3154 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
3155 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
3158 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
3162 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
3166 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3167 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
3170 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
3174 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3178 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
3179 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
3182 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
3186 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
3190 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3191 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
3194 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
3198 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3202 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
3203 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
3206 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
3210 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3214 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3215 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3218 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3231 ; AVX512BW-FCP: # %bb.0:
3232 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3233 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3234 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3237 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
3241 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
3243 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3246 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3251 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
3254 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3259 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
3262 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
3267 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
3270 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
3276 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
3279 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
3282 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
3285 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
3288 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
3291 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
3294 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
3297 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
3310 ; AVX512DQ-BW: # %bb.0:
3311 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3312 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3313 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3314 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
3319 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3320 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
3325 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3326 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
3329 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
3333 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
3337 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
3338 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
3341 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
3345 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
3349 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
3350 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
3353 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
3357 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
3361 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3362 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
3365 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
3369 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3373 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
3374 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
3377 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
3381 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
3385 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3386 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
3389 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
3393 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3397 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
3398 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
3401 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
3405 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3409 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3410 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3413 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3426 ; AVX512DQ-BW-FCP: # %bb.0:
3427 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3428 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3429 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3432 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
3436 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
3438 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
3441 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3446 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
3449 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3454 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
3457 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
3462 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
3465 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
3471 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
3474 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
3477 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
3480 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
3483 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
3486 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
3489 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
3492 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
3504 %strided.vec0 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
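; Editor's note (added; not part of the autogenerated assertions): the mask
; <0,8,16,...,120> in %strided.vec0 above gathers every eighth byte of the
; <128 x i8> wide load, starting at element 0. That is the first of the
; eight deinterleaved streams this stride-8 load test exercises; the CHECK
; blocks above pin down how each target lowers that extraction.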
; SSE: # %bb.0:
; SSE-NEXT: subq $904, %rsp # imm = 0x388
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0]
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3]
; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,3,3]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,3,3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,1,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,3,3]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4426 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4428 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4429 ; SSE-NEXT: # xmm0 = mem[0,1,3,3]
4430 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4438 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1]
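; The movaps reloads below pull the eight assembled results back out of the spill slots; judging by the gaps in the matched line numbers, they alternate with the (unmatched) stores through the destination pointers, and the trailing movq loads of {{[0-9]+}}(%rsp) appear to fetch the remaining pointers.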
4439 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4441 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4443 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4445 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4447 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4449 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4451 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4453 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4457 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4460 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4463 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4466 ; SSE-NEXT: addq $904, %rsp # imm = 0x388
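; The AVX lowering below trades the SSE spill churn for a 360-byte frame and a per-lane stanza pattern: each stanza broadcasts a byte-pair shuffle mask ([0,8], [1,9], through [7,15], plus the [0,0,k,k+8] word forms), stitches the shuffled pieces together with vpunpcklwd and vpblendw/vblendps, and spills one assembled 32-byte result per lane before the final store-out.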
4470 ; AVX: # %bb.0:
4471 ; AVX-NEXT: subq $360, %rsp # imm = 0x168
4472 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
4475 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4478 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4479 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4480 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
4482 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4486 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4487 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
4488 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7]
4489 ; AVX-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
4491 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4493 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4495 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4497 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4500 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
4501 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4504 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
4505 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5,6,7]
4506 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7]
4511 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4513 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
4515 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4518 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4520 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
4521 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5],xmm13[6,7]
4523 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4526 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4528 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
4531 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4534 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4535 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4536 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
4539 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4540 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
4541 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4542 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
4545 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4546 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
4547 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4550 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4551 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4552 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
4553 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4555 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4557 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4558 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4559 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4561 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4563 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
4564 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6,7]
4565 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4568 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4570 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4571 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4573 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4575 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4576 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
4577 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4579 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4581 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4584 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4585 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4588 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4589 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4590 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4591 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
4592 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4594 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4596 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4597 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
4599 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4601 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4602 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4603 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
4606 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4607 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4609 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4611 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4612 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
4613 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4616 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4617 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4619 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4621 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4622 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4623 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4625 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4627 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4628 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4630 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4632 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4633 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4636 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4637 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4638 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4639 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
4642 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4643 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
4644 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4647 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4648 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4649 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
4650 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4652 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4654 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4655 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4656 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4660 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4661 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
4662 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4665 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4667 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4670 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4671 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4673 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4675 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4676 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4678 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4680 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4681 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4684 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4685 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4686 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4687 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
4688 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4690 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4692 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4693 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
4694 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4696 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4698 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4699 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4700 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
4702 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4704 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4705 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4706 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4709 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4710 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
4711 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4714 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4716 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4717 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4719 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4721 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4722 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4723 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4726 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4729 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4730 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4733 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4734 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4735 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4736 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
4738 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4740 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4741 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
4744 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4745 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4746 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
4747 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4749 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4751 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4752 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4753 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4755 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4757 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4758 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
4759 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4761 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4763 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4764 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4766 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4768 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4769 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4770 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4772 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4774 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4775 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4777 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4779 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4780 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4783 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4784 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4785 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4786 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
4787 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4790 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4791 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
4792 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4794 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4796 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4797 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4798 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
4799 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4802 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4803 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4806 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4807 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
4808 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4813 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4816 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4817 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4819 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4821 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4822 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4824 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4826 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4827 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4830 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4831 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4832 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4833 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
4834 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4836 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4838 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4839 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
4840 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4842 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4844 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4845 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
4846 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
4848 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4850 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4851 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4852 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4854 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4856 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
4857 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
4858 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4861 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4862 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4865 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4866 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4869 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4872 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4873 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
4876 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4877 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4878 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4880 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4882 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4884 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4886 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4888 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4889 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4891 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4892 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4894 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4896 ; AVX-NEXT: addq $360, %rsp # imm = 0x168
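; The AVX2 version below repeats the same eight stanzas but materializes its [0,8] masks with vpbroadcastw instead of vbroadcastss/vmovd and blends at dword granularity with vpblendd in place of the AVX vpblendw/vblendps pairs; the frame size is unchanged (subq $360, %rsp).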
4901 ; AVX2: # %bb.0:
4902 ; AVX2-NEXT: subq $360, %rsp # imm = 0x168
4904 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4905 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
4909 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4910 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
4912 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4913 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
4917 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4918 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
4919 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3]
4921 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4924 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4926 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4927 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
4930 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
4931 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
4933 ; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4935 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
4936 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3]
4937 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
4942 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4944 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
4947 ; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4949 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4951 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
4954 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm13[7]
4956 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4959 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4961 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
4963 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4967 ; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4968 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4971 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
4972 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
4973 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
4974 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4976 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4979 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4981 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4984 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4985 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
4987 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4989 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4991 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4994 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4996 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
4997 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
4998 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5001 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5003 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5005 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5007 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5010 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5011 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5013 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5015 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5016 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5019 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5022 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5023 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5024 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5025 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5027 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5029 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5031 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5034 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5036 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5037 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5042 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5044 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5047 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5048 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5049 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5053 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5054 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5057 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5060 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5061 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5063 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5065 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5066 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5068 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5070 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5073 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5074 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5075 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5076 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5078 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5081 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5083 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5087 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5088 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5090 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5093 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5095 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5097 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5099 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5100 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5101 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5105 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5107 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5109 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5112 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5114 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5116 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5117 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5119 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5121 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5124 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5125 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5126 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5127 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5129 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5131 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5133 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5135 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5138 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5139 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5142 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5144 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5148 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5149 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5150 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5153 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5155 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5156 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5159 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5162 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5163 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5166 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5167 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5169 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5171 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5174 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5175 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5176 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5177 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5180 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5182 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5186 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5188 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5189 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5191 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5194 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5196 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5198 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5200 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5201 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5202 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5204 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5206 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5207 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5209 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5211 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5214 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5216 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5218 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5220 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5222 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5225 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5226 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5227 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5228 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5230 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5233 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5236 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5238 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5239 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5241 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5243 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5245 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5247 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5249 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5251 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5252 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5253 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5257 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5260 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5263 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5264 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5266 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5268 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5269 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5272 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5275 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5276 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5277 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5278 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5280 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5282 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5284 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5286 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5289 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5290 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5293 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5295 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5297 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5300 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5301 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5302 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5304 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5306 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5307 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5309 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5311 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5314 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5317 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5320 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5323 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5324 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5325 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5326 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5328 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5330 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5332 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5334 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5336 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5337 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5339 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5340 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5342 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5344 ; AVX2-NEXT: addq $360, %rsp # imm = 0x168
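; As far as the matched lines show, the AVX2-FP output below is identical to the AVX2 output apart from the check prefix, so whatever tuning distinguishes that run line changes nothing in the code generated here.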
5349 ; AVX2-FP: # %bb.0:
5350 ; AVX2-FP-NEXT: subq $360, %rsp # imm = 0x168
5352 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5353 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
5357 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5358 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
5360 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5361 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
5365 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5366 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
5367 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3]
5369 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5372 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5374 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5375 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
5378 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5379 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
5381 ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5383 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
5384 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3]
5385 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
5390 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5392 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
5395 ; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5397 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5399 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
5402 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm13[7]
5404 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5407 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5409 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
5411 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5415 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5416 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
5419 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
5420 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
5421 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
5422 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5424 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5427 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5429 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5432 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5433 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5435 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5437 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5439 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5442 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5444 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
5445 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
5446 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5449 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5451 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5453 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5455 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5458 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5459 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5461 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5463 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5464 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5467 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5470 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5471 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5472 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5473 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5475 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5477 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5479 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5482 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5484 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5485 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5490 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5492 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5495 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5496 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5497 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5501 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5502 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5505 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5508 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5509 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5511 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5513 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5514 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5516 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5518 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5521 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5522 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5523 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5524 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5526 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5529 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5531 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5535 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5536 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5538 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5541 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5543 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5545 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5547 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5548 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5549 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5553 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5555 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5557 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5560 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5562 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5564 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5565 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5567 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5569 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5572 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5573 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5574 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5575 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5577 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5579 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5581 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5583 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5586 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5587 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5590 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5592 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5596 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5597 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5598 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5601 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5603 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5604 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5607 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5610 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5611 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5614 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5615 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5617 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5619 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5622 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5623 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5624 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5625 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5628 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5630 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5634 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5636 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5637 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5639 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5642 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5644 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5646 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5648 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5649 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5650 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5652 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5654 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5655 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5657 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5659 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5662 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5664 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5666 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5668 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5670 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5673 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5674 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5675 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5676 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5678 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5681 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5684 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5686 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5687 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5689 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5691 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5693 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5695 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5697 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5699 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5700 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5701 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5705 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5708 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5711 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5712 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5714 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5716 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5717 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5720 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5723 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5724 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5725 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5726 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5728 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5730 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5732 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5734 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5737 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5738 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
5741 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5743 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5745 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5748 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5749 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
5750 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
5752 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5754 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
5755 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5757 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5759 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5762 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5765 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5768 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
5771 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5772 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5773 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5774 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5776 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5778 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5780 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5782 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5784 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5785 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5787 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5788 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5790 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5792 ; AVX2-FP-NEXT: addq $360, %rsp # imm = 0x168
5797 ; AVX2-FCP: # %bb.0:
5800 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5802 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5806 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5808 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5809 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
5814 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5819 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
5821 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5822 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
5824 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
5826 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
5827 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm1[3]
5832 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
5833 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
5834 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
5835 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
5836 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
5837 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5840 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5843 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
5844 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5847 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3]
5850 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3]
5851 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
5852 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5853 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5856 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5859 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
5860 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5863 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3]
5866 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3]
5867 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
5868 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5869 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5872 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5875 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
5876 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5879 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
5881 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
5884 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
5885 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5886 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5888 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
5890 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
5892 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
5894 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
5897 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7]
5898 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
5901 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
5902 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
5903 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
5904 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm13[3]
5907 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
5908 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
5909 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3]
5910 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3]
5911 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5914 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5917 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
5918 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5921 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
5924 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3]
5925 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
5926 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5929 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
5932 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7]
5933 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
5936 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
5939 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm14[1],xmm4[2,3]
5940 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
5941 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5944 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
5947 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
5948 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
5951 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
5954 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
5955 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
5956 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
5957 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5959 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5961 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5965 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5967 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5969 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5976 ; AVX512: # %bb.0:
5981 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
5985 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5988 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
5992 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
5994 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5999 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7]
6000 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6006 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
6012 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
6013 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
6015 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3]
6016 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
6020 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
6024 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
6026 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
6031 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
6033 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
6034 ; AVX512-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
6037 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
6040 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6044 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6047 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6048 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6053 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6058 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6059 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6062 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6063 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6065 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
6070 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6071 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
6076 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
6079 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6080 ; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
6084 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
6085 ; AVX512-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6089 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
6092 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
6093 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
6098 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
6102 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
6103 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
6106 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6107 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6109 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
6113 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6114 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
6118 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
6121 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6122 ; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
6126 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
6127 ; AVX512-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6131 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
6134 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
6135 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
6140 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
6144 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
6145 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
6148 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6149 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6151 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
6156 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6157 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
6161 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
6164 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6165 ; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
6170 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
6171 ; AVX512-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6175 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
6178 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
6179 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6182 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6187 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6188 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6191 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6192 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6194 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
6198 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6199 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
6204 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
6207 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6208 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
6213 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6214 ; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6218 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
6221 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
6222 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6226 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6231 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6232 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6235 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6236 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6238 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
6242 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6243 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
6248 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6251 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6252 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
6255 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
6256 ; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6260 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
6263 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
6264 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6268 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6273 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6274 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6277 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6278 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6279 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
6283 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6284 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
6288 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6291 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
6292 ; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
6295 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
6296 ; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6299 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
6302 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
6303 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6306 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
6309 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6310 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
6313 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6314 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6320 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6322 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6324 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6330 ; AVX512-FCP: # %bb.0:
6332 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6333 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
6337 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6341 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
6342 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6346 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6350 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
6351 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
6354 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
6357 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
6358 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
6360 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6361 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6365 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
6368 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
6369 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
6372 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
6375 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6376 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6384 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
6391 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
6392 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
6399 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
6402 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6403 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6410 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6416 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
6417 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6422 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6425 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6426 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6432 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
6437 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
6438 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6441 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
6443 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
6444 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
6447 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
6448 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
6451 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
6454 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
6455 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
6458 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
6461 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
6462 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
6467 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
6472 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
6473 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
6478 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
6481 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
6482 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
6487 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
6491 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
6492 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
6495 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
6498 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
6499 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
6505 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6507 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6509 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6515 ; AVX512DQ: # %bb.0:
6520 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
6524 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6527 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
6531 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
6533 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6538 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7]
6539 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6545 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
6551 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
6552 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
6554 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3]
6555 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
6559 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
6563 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
6565 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
6570 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
6572 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
6573 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
6576 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
6579 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6583 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6586 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6587 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6592 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6597 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6598 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6601 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6602 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6604 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
6609 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6610 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
6615 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
6618 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6619 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
6623 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
6624 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6628 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
6631 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
6632 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
6637 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
6641 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
6642 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
6645 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6646 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6648 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
6652 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6653 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
6657 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
6660 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6661 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
6665 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
6666 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6670 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
6673 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
6674 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
6679 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
6683 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
6684 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
6687 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6688 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6690 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
6695 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6696 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
6700 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
6703 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6704 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
6709 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
6710 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6714 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
6717 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
6718 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6721 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6726 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6727 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6730 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6731 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6733 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
6737 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6738 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
6743 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
6746 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6747 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
6752 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6753 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6757 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
6760 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
6761 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6765 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6770 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6771 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6774 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6775 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6777 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
6781 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6782 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
6787 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6790 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6791 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
6794 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
6795 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6799 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
6802 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
6803 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6807 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6812 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6813 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6816 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6817 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6818 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
6822 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6823 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
6827 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6830 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
6831 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
6834 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
6835 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6838 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
6841 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
6842 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6845 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
6848 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6849 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
6852 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6853 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6859 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6861 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6863 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6869 ; AVX512DQ-FCP: # %bb.0:
6871 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6872 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
6876 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6880 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
6881 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6885 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
6889 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
6890 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
6893 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
6896 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
6897 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
6899 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6900 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6904 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
6907 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
6908 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
6911 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
6914 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6915 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6923 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
6930 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
6931 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
6938 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
6941 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6942 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6949 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6955 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
6956 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6961 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6964 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6965 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6971 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
6976 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
6977 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6980 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
6982 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
6983 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
6986 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
6987 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
6990 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
6993 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
6994 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
6997 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
7000 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
7001 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
7006 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
7011 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
7012 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
7017 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
7020 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
7021 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
7026 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
7030 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
7031 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
7034 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
7037 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
7038 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
7044 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7046 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7048 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7054 ; AVX512BW: # %bb.0:
7055 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7056 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7057 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
7062 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
7066 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7069 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
7073 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
7075 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7080 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7]
7081 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7086 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
7091 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
7092 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3]
7094 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
7095 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
7104 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
7109 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
7111 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
7115 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7120 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7122 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
7123 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
7126 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
7129 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
7130 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7133 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7134 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7139 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7144 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
7146 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7150 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
7155 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7157 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
7158 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7161 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7164 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7165 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7168 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7169 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7174 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7179 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
7181 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7185 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
7190 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7192 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
7193 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7196 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7199 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7200 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7203 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7204 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7209 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7214 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
7216 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7220 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
7225 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7227 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
7228 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7231 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7234 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7235 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7238 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7239 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7244 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7249 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
7251 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
7255 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7260 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7262 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
7263 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7266 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7269 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7270 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7273 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7274 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7279 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
7284 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
7286 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
7290 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7295 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7297 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7]
7298 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
7301 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
7304 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
7305 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7308 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7309 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7313 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
7318 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
7320 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
7324 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
7329 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
7331 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
7332 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
7335 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7338 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7339 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
7342 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
7343 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7356 ; AVX512BW-FCP: # %bb.0:
7358 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
7365 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7372 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
7373 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
7376 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
7379 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
7380 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
7382 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
7383 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
7384 ; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7387 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
7392 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
7393 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
7398 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
7401 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
7402 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
7408 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
7413 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
7414 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
7419 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
7422 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
7423 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
7428 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
7433 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
7434 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
7439 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
7442 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7443 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
7449 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
7454 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
7455 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
7459 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
7460 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
7461 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
7464 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
7465 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
7468 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7471 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
7472 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
7475 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
7478 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
7479 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
7482 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
7485 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
7486 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
7489 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
7492 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
7493 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
7496 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
7499 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
7500 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
7503 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
7506 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
7507 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
7508 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7514 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7516 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7518 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7524 ; AVX512DQ-BW: # %bb.0:
7525 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7526 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7527 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
7532 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
7536 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7539 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
7543 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
7545 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7550 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7]
7551 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7556 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
7561 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
7562 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3]
7564 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
7565 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
7574 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
7579 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
7581 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
7585 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7590 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7592 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
7593 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
7596 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
7599 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
7600 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7603 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7604 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7609 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7614 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
7616 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7620 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
7625 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7627 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
7628 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7631 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7634 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7635 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7638 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7639 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7644 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7649 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
7651 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7655 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
7660 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7662 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
7663 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7666 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7669 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7670 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7673 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7674 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7679 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7684 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
7686 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7690 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
7695 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7697 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
7698 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7701 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7704 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7705 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7708 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7709 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7714 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
7719 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
7721 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
7725 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7730 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7732 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
7733 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
7736 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
7739 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7740 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7743 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7744 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7749 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
7754 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
7756 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
7760 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
7765 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
7767 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7]
7768 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
7771 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
7774 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
7775 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
7778 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7779 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
7783 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
7788 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
7790 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
7794 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
7799 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
7801 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
7802 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
7805 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7808 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7809 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
7812 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
7813 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7826 ; AVX512DQ-BW-FCP: # %bb.0:
7828 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
7835 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7842 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
7843 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
7846 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
7849 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
7850 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
7852 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
7853 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
7854 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7857 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
7862 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
7863 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
7868 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
7871 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
7872 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
7878 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
7883 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
7884 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
7889 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
7892 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
7893 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
7898 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
7903 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
7904 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
7909 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
7912 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
7913 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
7919 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
7924 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
7925 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
7929 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
7930 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
7931 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
7934 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
7935 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
7938 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
7941 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
7942 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
7945 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
7948 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
7949 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
7952 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
7955 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
7956 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
7959 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
7962 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
7963 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
7966 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
7969 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
7970 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
7973 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
7976 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
7977 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
7978 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7984 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7986 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7988 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7993 %strided.vec0 = shufflevector <256 x i8> %wide.vec, <256 x i8> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
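(The match above is from the IR side of the test rather than the generated assembly: %strided.vec0 selects every eighth byte — indices 0, 8, ..., 248 — from the <256 x i8> wide load, i.e. the first of eight de-interleaved stride-8 lanes. As a minimal sketch, not part of the test file and with an illustrative function name and width, the same pattern at <16 x i8> width looks like:

; Stride-8 de-interleave sketch at <16 x i8> width (illustrative only):
; %wide.vec holds 16 interleaved bytes; the shuffle keeps bytes 0 and 8,
; producing the first of eight <2 x i8> strided lanes.
define <2 x i8> @strided_vec0_sketch(ptr %p) {
  %wide.vec = load <16 x i8>, ptr %p, align 16
  %strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <2 x i32> <i32 0, i32 8>
  ret <2 x i8> %strided.vec0
}

The punpcklbw/punpcklwd and vpunpcklwd/vpblendd sequences matched throughout this listing are the backends' lowerings of exactly this shuffle mask at the full 256-element width.)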
8014 ; SSE: # %bb.0:
8015 ; SSE-NEXT: subq $2040, %rsp # imm = 0x7F8
8017 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8019 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8021 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8027 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8029 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8032 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0]
8057 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8061 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
8062 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8093 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
8094 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8096 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8099 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8106 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8113 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8116 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8120 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8123 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8129 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
8130 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8132 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8135 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8139 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8142 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8149 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8152 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8156 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8159 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8165 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3]
8166 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8169 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8172 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8173 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8174 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8175 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8177 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8178 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535]
8182 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8185 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8186 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8187 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8188 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
8189 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8195 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8198 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8199 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8200 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8201 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8203 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8204 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,3]
8205 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,65535]
8208 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8211 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8212 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
8213 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8214 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
8215 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8216 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,1,3]
8221 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
8223 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8226 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8227 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8228 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8229 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8231 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8232 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
8233 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0]
8236 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8239 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8240 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
8241 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8242 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
8243 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8244 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,1,3]
8248 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8251 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8252 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8253 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8254 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8256 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8257 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535]
8260 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8263 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8264 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
8265 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8266 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8267 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8274 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
8275 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8276 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8279 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8280 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
8281 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8282 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
8284 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8287 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8290 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8291 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
8292 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8293 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
8294 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8299 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8302 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8303 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
8304 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8305 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
8307 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8308 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,2,3]
8311 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8314 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8315 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
8316 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8317 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
8318 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8319 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,1,3]
8324 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8325 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8328 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8329 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8330 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8331 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8333 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8334 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
8337 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8340 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8341 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
8342 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8343 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
8344 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8345 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
8349 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8352 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8353 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8354 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8355 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8357 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8360 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8363 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8364 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
8365 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8366 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8367 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8374 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
8375 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8376 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8379 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8380 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
8381 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8382 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
8384 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8387 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8390 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8391 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8392 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8393 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
8394 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8399 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8402 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8403 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
8404 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8405 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
8407 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,2,3]
8410 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8413 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8414 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
8415 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8416 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
8417 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8418 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
8423 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8424 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8427 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8428 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8429 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8430 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8432 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8433 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
8436 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8439 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8440 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
8441 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8442 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
8443 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8444 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
8451 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8452 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8453 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8454 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8456 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8459 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8462 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8463 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
8464 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8465 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8466 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8473 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
8474 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8475 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8478 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8479 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
8480 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8481 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8485 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8488 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8489 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8490 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8491 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
8492 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8497 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8500 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8501 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
8502 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8503 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
8505 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3]
8508 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8511 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8512 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8513 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8514 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8515 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8516 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3]
8521 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8522 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8525 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8526 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8527 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8528 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8530 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8531 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
8534 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8537 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8538 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
8539 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8540 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
8541 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,1,3]
8545 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8549 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8550 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
8551 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8552 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8555 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8557 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
8558 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8559 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
8563 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
8564 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8571 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
8572 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8574 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8576 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8577 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8578 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
8582 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8584 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8585 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8590 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8592 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8593 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8594 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8598 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8600 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8601 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8602 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8607 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
8608 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8610 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8611 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8612 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8616 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8618 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8619 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8620 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8624 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8626 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8627 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
8628 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7]
8632 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8634 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8635 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
8642 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
8643 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8644 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8646 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8647 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8648 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
8652 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8654 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8655 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8660 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8662 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8663 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0]
8664 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8668 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8670 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8671 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0]
8672 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8677 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
8678 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8680 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8681 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0]
8682 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8686 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8688 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8689 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0]
8690 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8694 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8696 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8697 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
8698 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
8702 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8704 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8705 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
8712 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
8713 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8714 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8716 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8717 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8718 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
8722 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8724 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8725 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8730 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8732 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8733 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8734 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8738 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8740 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8741 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8742 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8747 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
8748 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8750 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8751 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8752 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8756 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8758 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8759 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8760 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8767 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
8768 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
8772 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8774 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8775 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
8782 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8783 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8784 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8786 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8787 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8788 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
8792 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8794 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8795 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
8800 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8802 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8803 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8804 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8808 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8810 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8811 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8812 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8817 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
8818 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8820 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8821 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8822 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5]
8826 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8828 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8829 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0]
8830 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
8834 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8836 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8837 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
8838 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
8844 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8845 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
8852 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8853 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8854 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8858 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8864 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8865 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8871 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8873 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8874 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8879 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8880 ; SSE-NEXT: # xmm1 = mem[0,1,3,3]
8883 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8890 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8891 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8892 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8896 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8902 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8903 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8909 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8911 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8912 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8917 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8918 ; SSE-NEXT: # xmm1 = mem[0,1,3,3]
8921 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8928 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8929 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8930 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8934 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8941 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8947 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8949 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8950 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8955 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8956 ; SSE-NEXT: # xmm1 = mem[0,1,3,3]
8959 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8966 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8967 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8971 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8978 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8984 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8986 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8991 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,3,3]
8994 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9001 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9002 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9003 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9005 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7]
9009 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9011 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9012 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
9016 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9017 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9018 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9019 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9023 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9024 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9025 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9026 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9031 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9032 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9033 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9034 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9035 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9039 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9040 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9041 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9042 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9046 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9048 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9049 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
9053 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9055 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9056 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
9062 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9063 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9064 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9066 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7]
9070 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9072 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9073 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
9077 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9078 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9079 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9080 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9084 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9085 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9086 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9087 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9092 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9093 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9094 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9095 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9096 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9100 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9101 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9102 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9103 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9107 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9109 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9110 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
9114 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9116 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7]
9122 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9123 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9124 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9126 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9127 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
9131 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9133 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9134 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
9138 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9139 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9140 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9141 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9145 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9146 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9147 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9148 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9153 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9154 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9155 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9156 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9157 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9161 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9162 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9163 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9164 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9170 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9171 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
9175 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9178 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
9184 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9185 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9186 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9188 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9189 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
9193 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9195 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9196 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
9200 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9201 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9202 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9203 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9207 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9208 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9209 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9210 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9215 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9216 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9217 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9218 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9219 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
9223 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9224 ; SSE-NEXT: # xmm1 = mem[0,1,1,3]
9225 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9226 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
9230 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9232 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9233 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
9237 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9239 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9240 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
9246 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9247 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9248 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9249 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9251 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9252 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9254 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9256 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9263 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9264 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9267 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9268 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
9271 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9272 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9274 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9275 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9280 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9281 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9282 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9285 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9286 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
9289 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9290 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9292 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9293 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9297 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9298 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9300 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9301 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9303 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9305 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9314 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9315 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9316 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9317 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9319 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9320 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9322 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9324 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9331 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9332 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9335 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9336 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
9339 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9340 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9342 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9343 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9348 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9349 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9350 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9353 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9354 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
9357 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9358 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9360 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9361 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9365 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9366 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9368 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9369 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9371 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9381 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9382 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9383 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9384 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9386 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9387 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9389 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9391 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9398 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9399 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9402 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9403 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
9406 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9407 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9409 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9410 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9415 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9416 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9417 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9420 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9421 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
9424 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9425 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9427 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9428 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9432 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9433 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9435 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9436 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9438 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9440 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9449 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9450 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9451 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9452 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9454 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9455 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9457 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9459 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9466 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9467 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9470 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9471 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
9474 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9475 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9477 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9478 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
9483 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9484 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9485 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9488 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9489 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
9492 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9493 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9495 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3]
9499 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9500 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9502 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9503 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9505 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9515 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
9516 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7]
9520 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9525 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9526 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9530 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9531 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9536 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9537 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9538 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9542 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9543 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9547 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9548 ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
9558 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
9559 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7]
9563 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9568 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9569 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9573 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9574 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9579 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9580 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9581 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9585 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9586 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9590 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9591 ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
9595 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9602 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
9603 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9604 ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
9608 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9613 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9614 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9618 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9619 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9624 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9625 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9626 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9630 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9631 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9635 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9636 ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
9640 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9647 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
9648 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9649 ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
9653 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9658 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9659 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9663 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9664 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9669 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9670 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9671 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
9675 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9676 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7]
9680 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9681 ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
9692 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
9693 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9697 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9703 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9704 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9710 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9712 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9713 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9718 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,3,3]
9721 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9728 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
9729 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9733 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9739 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9740 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9746 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9748 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9749 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9754 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9755 ; SSE-NEXT: # xmm2 = mem[0,1,3,3]
9758 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9765 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
9766 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9770 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9776 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9777 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9783 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9785 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9790 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,3,3]
9793 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9800 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
9801 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9803 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9809 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9813 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9816 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
9817 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9821 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9823 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9824 ; SSE-NEXT: # xmm0 = mem[0,1,3,3]
9825 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9833 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
9834 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9836 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9838 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9840 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9842 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9844 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9846 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9848 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9850 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9852 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9854 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9856 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9858 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9860 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9862 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9864 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9866 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9868 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9870 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9872 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9874 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
9876 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9878 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9880 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9882 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
9887 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
9892 ; SSE-NEXT: addq $2040, %rsp # imm = 0x7F8
9896 ; AVX: # %bb.0:
9897 ; AVX-NEXT: subq $808, %rsp # imm = 0x328
9898 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
9900 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9904 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9905 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9906 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
9909 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9911 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9913 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
9914 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7]
9915 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
9918 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9920 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9922 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
9923 ; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9926 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9929 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9930 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9931 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
9932 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
9934 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9937 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9939 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9941 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9944 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9946 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9947 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7]
9949 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9953 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9954 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9959 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9961 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
9962 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
9965 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
9966 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
9967 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9969 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9972 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9974 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9976 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9979 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9981 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9982 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
9984 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9986 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9989 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9991 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9993 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9996 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9997 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
9998 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10000 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10003 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10005 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
10007 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10010 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10012 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10013 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
10015 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10018 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10020 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10022 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10025 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10027 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
10028 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
10031 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10032 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10033 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10034 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
10035 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10038 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10039 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
10041 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10043 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10044 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7]
10045 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
10047 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10049 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
10050 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10053 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
10054 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
10055 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
10056 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10058 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10060 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10061 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10063 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10065 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
10066 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
10067 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10070 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
10073 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10075 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
10076 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3],xmm14[4,5,6,7]
10079 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
10080 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
10081 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10082 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10084 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10086 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
10087 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10089 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10091 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10092 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7]
10093 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10095 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10097 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10098 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10100 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10102 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
10103 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
10104 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
10105 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10107 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10109 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10110 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10112 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10114 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10115 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7]
10116 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10118 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10120 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
10121 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10123 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10125 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
10126 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
10129 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10130 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10131 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10132 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
10133 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10135 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10137 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10138 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
10139 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10142 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10143 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
10144 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
10145 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10148 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
10149 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10150 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10152 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10154 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10155 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10156 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10159 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10162 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10163 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7]
10165 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10167 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10170 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10172 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10173 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7]
10176 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10177 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10178 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10179 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10181 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10183 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10184 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10186 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10188 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10189 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
10190 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10192 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10194 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10195 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10197 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10199 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10200 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10201 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10202 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10204 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10206 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
10207 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10209 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10211 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10212 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
10213 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10215 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10217 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10218 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10220 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10222 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
10223 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
10226 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10227 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10228 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10229 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
10230 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10232 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10234 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10235 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
10236 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10238 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10240 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10241 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
10242 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
10243 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10245 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10247 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
10248 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10250 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10252 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10253 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10254 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10255 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10257 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10259 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10260 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10262 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10264 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10265 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7]
10266 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10269 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10272 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10273 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7]
10276 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10277 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10278 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10281 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10282 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10284 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10286 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10287 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
10288 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10290 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10292 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10293 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10295 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10297 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10298 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10299 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10300 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10302 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10304 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
10305 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10307 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10309 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10310 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
10311 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10313 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10315 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10316 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10318 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10320 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
10321 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
10324 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10325 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10326 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10327 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
10329 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10331 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10332 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
10333 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10335 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10337 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10338 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
10339 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
10340 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10342 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10344 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
10345 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10346 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10348 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10350 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10351 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10352 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10353 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10355 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10357 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10358 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10360 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10362 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10363 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7]
10364 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10366 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10368 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10371 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10373 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10374 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7]
10377 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10378 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10379 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10380 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10382 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10384 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10387 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10388 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
10391 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10392 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10394 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10396 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10397 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10398 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10399 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10401 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10403 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
10404 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10406 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10408 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10409 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
10410 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10412 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10414 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10415 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10417 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10419 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
10420 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
10423 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10424 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10425 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10426 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
10427 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10429 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10431 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10432 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
10433 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10435 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10437 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10438 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
10439 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
10441 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10443 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
10444 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10445 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10448 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10449 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10450 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10451 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10453 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10455 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10456 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10458 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10460 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10461 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7]
10462 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10464 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10466 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10469 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10471 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10472 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7]
10475 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10476 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10477 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10478 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10480 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10482 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10483 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10485 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10487 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10488 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
10489 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10491 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10493 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10496 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10497 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10498 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10501 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
10503 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10505 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10506 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
10507 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10509 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10511 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10512 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10514 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10516 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
10517 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
10520 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10521 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10522 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10523 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
10524 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10526 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10528 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10529 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
10530 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10532 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10534 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10535 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
10536 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
10537 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10539 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10541 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
10542 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10543 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10545 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10547 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10548 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10549 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10550 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10552 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10554 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10555 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10557 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10559 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10560 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7]
10561 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10563 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10565 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10568 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10570 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10571 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7]
10574 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10575 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10576 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10577 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10579 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10581 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10582 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10584 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10586 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10587 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
10588 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10590 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10592 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10593 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10595 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10597 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10598 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10599 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
10600 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10602 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10604 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
10605 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10608 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
10609 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
10612 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10615 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
10616 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
10619 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10620 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10621 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
10622 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10625 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10626 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
10628 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10630 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
10631 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7]
10632 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
10633 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10635 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10637 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
10638 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10639 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10641 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10643 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10644 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7]
10645 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
10646 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10648 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10650 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
10651 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10653 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10655 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10656 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5],xmm6[6,7]
10657 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10659 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10661 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10664 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10666 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
10667 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7]
10670 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
10671 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
10672 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10674 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10676 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
10677 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10679 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10681 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
10682 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5],xmm6[6,7]
10683 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10685 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10687 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
10688 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10690 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10692 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
10693 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
10694 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7]
10695 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10697 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10699 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
10700 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10702 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10704 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
10705 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
10708 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
10711 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
10712 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
10715 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10716 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
10717 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10719 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10721 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10723 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10725 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10727 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10729 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10731 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10733 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10735 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10737 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
10738 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10740 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10742 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
10744 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10746 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
10749 ; AVX-NEXT: addq $808, %rsp # imm = 0x328
10754 ; AVX2: # %bb.0:
10755 ; AVX2-NEXT: subq $840, %rsp # imm = 0x348
10757 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10758 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
10762 ; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10763 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10765 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
10767 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10769 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10771 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
10772 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
10774 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10775 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
10779 ; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10780 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
10782 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
10784 ; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10787 ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10788 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10789 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
10790 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
10792 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10795 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10797 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10799 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10802 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10804 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10807 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
10809 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10812 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10814 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10817 ; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10819 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10821 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
10824 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
10825 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
10826 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10827 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10830 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10832 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10834 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10836 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10839 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10841 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10842 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
10844 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10846 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10849 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10851 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10853 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10856 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10857 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
10858 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
10860 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10863 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10865 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
10867 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10872 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
10875 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
10877 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10880 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10882 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
10884 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10887 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10889 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
10892 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
10893 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10894 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10895 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10897 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10900 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10903 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10905 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10906 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3]
10908 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10911 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
10915 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
10916 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
10917 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
10918 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10920 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10922 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10923 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10925 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10927 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
10930 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
10931 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10933 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10935 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
10937 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10939 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
10942 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7]
10943 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
10944 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
10945 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10947 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10949 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
10950 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10952 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10954 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10955 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
10956 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10958 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10960 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10961 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10963 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10965 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
10966 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
10967 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
10968 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10970 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10972 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
10973 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10977 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10980 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
10981 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10983 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10985 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
10986 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10988 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10990 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
10993 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
10994 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
10995 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10996 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10998 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11000 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11002 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11004 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11007 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11008 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11010 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11012 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11014 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11016 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11018 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11020 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11021 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11022 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11025 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11028 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11031 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11032 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11034 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11036 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11037 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11039 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11041 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11044 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
11045 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11046 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11047 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11048 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11051 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11053 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11055 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11056 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11057 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11059 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11061 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11062 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11064 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11066 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11067 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11068 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11069 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11071 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11073 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11074 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11078 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11081 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11082 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11084 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11086 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11087 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11089 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11091 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11094 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11095 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11096 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11097 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11099 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11101 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11103 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11105 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11107 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11109 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11110 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11113 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11115 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11117 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11119 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11121 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11122 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11123 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11124 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11126 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11128 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11129 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11131 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11133 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11136 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11139 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11140 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11142 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11144 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11147 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
11148 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11149 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11150 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11151 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11153 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11155 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11156 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11159 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11160 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11161 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11164 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11165 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11167 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11169 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11170 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11171 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11172 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11174 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11176 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11177 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11181 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11184 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11185 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11187 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11189 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11190 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11192 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11194 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11197 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11198 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11199 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11200 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11203 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11205 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11207 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11209 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11211 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11212 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11214 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11217 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11220 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11222 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11223 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11224 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11225 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11227 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11229 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11230 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11232 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11234 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11237 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11238 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11240 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11242 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11245 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11248 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
11249 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11250 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11251 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11252 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11254 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11256 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11257 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11259 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11261 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11262 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11264 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11266 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11267 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11269 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11271 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11272 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11273 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11274 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11276 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11278 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11279 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11283 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11286 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11287 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11289 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11291 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11292 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11294 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11296 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11299 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11300 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11301 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11302 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11304 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11306 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11308 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11310 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11312 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11314 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11315 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11317 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11319 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11321 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11323 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11325 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11327 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11328 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11329 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11330 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11332 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11334 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11335 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11337 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11339 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11342 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11343 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11345 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11347 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11348 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11350 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11352 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11355 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
11356 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11357 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11358 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11360 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11362 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11363 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11365 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11367 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11368 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11369 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11371 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11373 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11376 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11377 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11378 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11381 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11384 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11387 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11388 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11390 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11392 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11393 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11395 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11397 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11400 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11401 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11402 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11403 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11405 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11407 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11409 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11411 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11413 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11415 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11416 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11418 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11420 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11422 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11424 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11426 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11428 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11429 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11430 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11431 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11433 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11435 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11436 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11438 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11440 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11443 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11444 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11446 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11448 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11449 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11451 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11453 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11456 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
11457 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11458 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11459 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11460 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11462 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11464 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11465 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11467 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11469 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11470 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11471 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11473 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11475 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11476 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11478 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11480 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11481 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11482 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11483 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11485 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11487 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11488 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11492 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11495 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11499 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11502 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11505 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11506 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11507 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11508 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11510 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11513 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11516 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11518 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
11519 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
11521 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
11523 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11525 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
11527 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11530 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11531 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3]
11532 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
11533 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11535 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11537 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11538 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11540 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11542 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11545 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7]
11546 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11548 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11550 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11551 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11553 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11555 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
11558 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
11559 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
11560 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
11561 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11563 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11565 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
11566 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11568 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11570 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
11571 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3]
11572 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11574 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11576 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
11577 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11579 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11581 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11582 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
11583 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3]
11584 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11586 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11588 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
11589 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11593 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
11596 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11599 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11602 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
11605 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
11606 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
11607 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
11608 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11610 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11612 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11614 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11616 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11618 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11620 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11622 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11624 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11626 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11628 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
11629 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11631 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11633 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
11634 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11636 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11638 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
11641 ; AVX2-NEXT: addq $840, %rsp # imm = 0x348
11646 ; AVX2-FP: # %bb.0:
11647 ; AVX2-FP-NEXT: subq $840, %rsp # imm = 0x348
11649 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11650 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
11654 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11655 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11657 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
11659 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11661 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11663 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11664 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11666 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11667 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
11671 ; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11672 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11674 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
11676 ; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11679 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11680 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
11681 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11682 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11684 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11687 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11689 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11691 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11694 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11696 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
11699 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11701 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11704 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11706 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
11709 ; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11711 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11713 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
11716 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
11717 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11718 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11719 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11722 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11724 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11726 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11728 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11731 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11733 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11734 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11736 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11738 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11741 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11743 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11745 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11748 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
11749 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11750 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11752 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11755 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11757 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11759 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11764 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11767 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11769 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11772 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11774 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11776 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11779 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11781 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11784 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11785 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11786 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11787 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11789 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11792 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11795 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11797 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
11798 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3]
11800 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11803 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11807 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
11808 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
11809 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
11810 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11812 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11814 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11815 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11817 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11819 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
11822 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
11823 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11825 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11827 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
11829 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11831 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
11834 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7]
11835 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
11836 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
11837 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11839 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11841 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
11842 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11844 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11846 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11847 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
11848 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11850 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11852 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11853 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11855 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11857 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
11858 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
11859 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
11860 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11862 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11864 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
11865 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11869 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11872 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
11873 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11875 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11877 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
11878 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11880 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11882 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
11885 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
11886 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
11887 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11888 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11890 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11892 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11894 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11896 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11899 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11900 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
11902 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11904 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11906 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
11908 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11910 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11912 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11913 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11914 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11917 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11920 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11923 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
11924 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11926 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11928 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11929 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11931 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11933 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
11936 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
11937 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
11938 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11939 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11940 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11943 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11945 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11947 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11948 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
11949 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11951 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11953 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
11954 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11956 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11958 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
11959 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
11960 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
11961 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11963 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11965 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
11966 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11970 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11973 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11974 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11976 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11978 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
11979 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11981 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11983 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
11986 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
11987 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11988 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11989 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11991 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11993 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11995 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11997 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11999 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12001 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
12002 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
12005 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12007 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12009 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12011 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12013 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12014 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12015 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12016 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12018 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12020 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12021 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12023 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12025 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12028 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
12031 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12032 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12034 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12036 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12039 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
12040 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
12041 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12042 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12043 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12045 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12047 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
12048 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12051 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12052 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
12053 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12056 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12057 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12059 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12061 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12062 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12063 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12064 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12066 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12068 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
12069 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12073 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
12076 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12077 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12079 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12081 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
12082 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12084 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12086 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12089 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
12090 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
12091 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
12092 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12095 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12097 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12099 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12101 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12103 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
12104 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
12106 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12109 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12112 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12114 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12115 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12116 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12117 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12119 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12121 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12122 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12124 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12126 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12129 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
12130 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12132 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12134 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12137 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12140 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
12141 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
12142 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12143 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12144 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12146 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12148 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
12149 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12151 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12153 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12154 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
12156 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12158 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12159 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12161 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12163 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12164 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12165 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12166 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12168 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
12170 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
12171 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12175 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
12178 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12179 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12181 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12183 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
12184 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12186 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12188 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12191 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
12192 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
12193 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
12194 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12196 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12198 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12200 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12202 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12204 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12206 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
12207 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
12209 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12211 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12213 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12215 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12217 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12219 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12220 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12221 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12222 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12224 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12226 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12227 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12229 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12231 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12234 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
12235 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12237 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12239 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12240 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12242 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
12244 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12247 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
12248 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
12249 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12250 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12252 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12254 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
12255 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12257 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12259 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12260 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
12261 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12263 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12265 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12268 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12269 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12270 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12273 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
12276 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
12279 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12280 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12282 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12284 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
12285 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12287 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12289 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12292 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
12293 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
12294 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
12295 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12297 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12299 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12301 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12303 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
12305 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12307 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
12308 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
12310 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12312 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12314 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12316 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12318 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12320 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12321 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12322 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12323 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12325 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12327 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12328 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12330 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12332 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12335 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
12336 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12338 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12340 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12341 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12343 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
12345 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12348 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6,7]
12349 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
12350 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12351 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12352 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12354 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12356 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
12357 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12359 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12361 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12362 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
12363 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12365 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12367 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12368 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12370 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12372 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12373 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
12374 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12375 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12377 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12379 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
12380 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12384 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
12387 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12391 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
12394 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12397 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
12398 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
12399 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
12400 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12402 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12405 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12408 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12410 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
12411 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
12413 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
12415 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12417 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
12419 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12422 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12423 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3]
12424 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
12425 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12427 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12429 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
12430 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12432 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12434 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12437 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7]
12438 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12440 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12442 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12443 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12445 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12447 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
12450 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
12451 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
12452 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
12453 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12455 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12457 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
12458 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12460 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12462 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
12463 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3]
12464 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12466 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12468 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
12469 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12471 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12473 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
12474 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
12475 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3]
12476 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12478 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12480 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
12481 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12485 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
12488 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
12491 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12494 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12497 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
12498 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12499 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
12500 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12502 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12504 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12506 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12508 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12510 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12512 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12514 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12516 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12518 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12520 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12521 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12523 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12525 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12526 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12528 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12530 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12533 ; AVX2-FP-NEXT: addq $840, %rsp # imm = 0x348
12538 ; AVX2-FCP: # %bb.0:
12539 ; AVX2-FCP-NEXT: subq $1096, %rsp # imm = 0x448
12541 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12543 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12545 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12547 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12548 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
12550 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12552 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12554 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12556 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12557 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
12559 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12562 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12563 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12565 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12567 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12568 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
12570 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12571 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
12573 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12574 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12578 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12580 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
12582 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12584 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12586 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
12588 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12589 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
12591 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
12592 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
12593 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
12594 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12596 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12598 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12600 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12605 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12607 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12609 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12611 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12613 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12616 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
12617 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
12619 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12621 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12623 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12626 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12628 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3]
12630 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12632 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12635 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12636 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
12637 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12639 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3]
12640 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
12641 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12642 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12645 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12647 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12650 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12652 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
12653 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12654 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12658 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
12659 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12661 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12663 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
12664 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
12665 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12666 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12667 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12670 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12672 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12673 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12675 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12677 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
12678 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12679 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12681 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12683 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
12685 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12687 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
12688 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
12689 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12690 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12691 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12693 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12695 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12701 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
12702 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12708 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
12712 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
12715 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
12716 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
12717 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12718 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12722 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12725 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
12726 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12728 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12730 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
12731 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12733 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12735 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
12736 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
12737 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12738 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12740 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12743 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12745 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12747 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12750 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12753 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
12754 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12756 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12762 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
12764 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12766 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
12767 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12769 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3]
12770 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
12771 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12772 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12774 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12776 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12777 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12779 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12781 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
12782 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12783 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12786 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
12789 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
12790 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12791 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12792 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12794 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
12795 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12796 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
12797 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12798 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12800 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12802 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
12803 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
12804 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12805 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
12806 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12807 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12809 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12811 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
12812 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm0[6,7]
12814 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
12815 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12816 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12817 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12818 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12820 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12822 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm8[0,1,2],xmm7[3]
12823 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
12824 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12825 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
12826 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12827 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
12829 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
12831 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
12832 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
12833 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm9[4,5,6,7]
12834 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12835 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
12837 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
12838 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12840 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,6],ymm11[7]
12841 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
12842 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12843 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12844 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12847 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5],ymm15[6,7]
12848 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
12849 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12850 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12852 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
12853 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12855 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm2[3]
12856 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12857 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12858 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
12862 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3]
12863 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
12864 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12865 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12867 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12870 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12872 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12874 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12877 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12879 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
12880 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12881 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12883 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12885 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
12886 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12888 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12890 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
12891 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
12892 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12893 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12894 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12896 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12898 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12899 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12901 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12903 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
12904 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12905 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12907 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12909 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
12910 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12914 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
12915 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
12916 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12917 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12923 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12929 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
12930 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12932 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12934 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
12936 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12938 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
12939 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
12940 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12941 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12943 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12945 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12947 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12949 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
12950 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12951 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12953 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12955 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
12956 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12960 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
12961 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
12962 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12963 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12965 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12968 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12970 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12972 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12976 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
12977 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12978 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12981 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
12982 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12985 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
12986 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
12987 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12988 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12991 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
12992 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12995 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
12996 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
12999 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
13002 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
13003 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
13004 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
13005 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13007 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13009 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13011 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13013 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13015 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13017 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13019 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13021 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13023 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13025 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13026 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13028 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13030 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13031 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13033 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13035 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13038 ; AVX2-FCP-NEXT: addq $1096, %rsp # imm = 0x448
13043 ; AVX512: # %bb.0:
13044 ; AVX512-NEXT: subq $520, %rsp # imm = 0x208
13049 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
13053 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13056 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
13060 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
13062 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
13067 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
13068 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
13074 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13076 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13081 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
13082 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
13084 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
13091 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13094 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13096 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
13102 ; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13103 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
13108 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
13113 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
13114 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
13116 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13119 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13121 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
13128 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
13129 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
13132 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13133 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13134 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
13135 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13136 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
13141 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
13142 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
13146 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13149 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13152 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
13156 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
13159 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13163 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
13166 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
13167 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
13173 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
13174 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13178 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
13179 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
13182 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
13187 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13189 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
13193 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
13196 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7]
13203 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13204 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
13206 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13208 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13211 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
13214 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
13215 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13216 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13218 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13220 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13225 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
13226 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
13229 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13230 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13231 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
13232 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13233 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
13235 ; AVX512-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13238 ; AVX512-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13240 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13241 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
13246 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
13249 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13250 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
13255 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
13256 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13261 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
13264 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7]
13265 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
13267 ; AVX512-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13271 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
13272 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13276 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13277 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
13280 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13283 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13285 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13287 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
13290 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13292 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13295 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
13300 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
13301 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13303 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13305 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
13308 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
13309 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
13310 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13312 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13314 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
13320 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
13321 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
13324 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13325 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13326 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
13327 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13328 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
13331 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13332 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
13334 ; AVX512-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13338 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13341 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13342 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
13347 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
13348 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13353 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13356 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
13357 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
13360 ; AVX512-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13362 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
13366 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13367 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
13370 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13373 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13377 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
13380 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13382 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13385 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
13388 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13390 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
13391 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13393 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13395 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
13398 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13399 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
13400 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13402 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13404 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
13408 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
13409 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
13412 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13413 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13414 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
13415 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13416 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
13417 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13419 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13421 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13422 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
13425 ; AVX512-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13427 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13430 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13431 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
13436 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
13437 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13442 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13445 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
13446 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
13447 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
13450 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
13451 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
13454 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13455 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
13458 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13461 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
13465 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
13468 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13470 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13473 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
13476 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13478 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
13479 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13481 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13483 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
13486 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13487 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
13488 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
13492 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
13497 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
13498 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
13502 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13503 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13504 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
13505 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13506 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
13507 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13509 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13511 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13512 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
13513 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13516 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13519 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13520 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
13525 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
13526 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13530 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13533 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
13534 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
13535 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13537 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13539 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
13540 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13544 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13545 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
13549 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13556 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
13559 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13561 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13564 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
13567 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
13569 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
13570 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13575 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
13578 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13579 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
13584 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
13589 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
13590 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
13594 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13595 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13596 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
13597 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13598 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
13599 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13602 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13604 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13605 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
13606 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13608 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13610 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13613 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13614 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
13619 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
13620 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13625 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13628 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
13629 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
13630 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13632 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13634 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
13635 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13639 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13640 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
13643 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13648 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13650 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
13655 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13658 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
13663 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
13668 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
13671 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13672 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
13677 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
13682 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
13683 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
13686 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13687 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13688 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24
13689 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
13692 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13694 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13695 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
13696 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13698 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13700 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13703 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13704 ; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
13709 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
13710 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13715 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
13718 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
13719 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
13721 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13723 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
13727 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
13728 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
13731 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13736 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13738 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
13743 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
13746 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
13750 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
13753 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
13756 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13757 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
13760 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
13764 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
13765 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
13768 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13769 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13770 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
13771 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13773 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13775 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13777 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13779 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13781 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13782 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13784 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13786 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13788 ; AVX512-NEXT: addq $520, %rsp # imm = 0x208
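;
; The AVX512 matches above are one hand-unrolled gather: for each element
; index i in 0..7 the block broadcasts byte-shuffle masks selecting bytes
; [i, i+8] (the [0,0,2,10,...] / [2,10,0,0,...] pairs), then chains
; vpunpcklwd, vpblendd and vinserti64x4 to pack the i-th i16 of every
; 8-element group into one zmm result, spilling intermediates to the
; 520-byte frame released by the addq above. A minimal IR sketch of the
; shuffle being lowered, at a narrower width than the test itself and with
; a hypothetical function name:
;
define <8 x i16> @deinterleave_elt0(<64 x i16> %wide) {
  ; pick element 0 of each consecutive group of 8 i16s (stride-8 stream 0)
  %v = shufflevector <64 x i16> %wide, <64 x i16> poison,
       <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  ret <8 x i16> %v
}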
13793 ; AVX512-FCP: # %bb.0:
13796 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
13797 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
13799 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13803 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
13805 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13808 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13809 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
13815 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
13817 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13821 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13822 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13824 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13826 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
13830 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13832 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
13835 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
13837 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
13843 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13851 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13858 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
13859 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13867 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
13870 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
13871 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
13872 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
13873 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13881 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
13889 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
13890 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
13897 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
13900 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
13908 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
13914 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
13915 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
13920 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
13923 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
13924 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
13925 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
13926 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13933 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
13939 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
13940 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
13946 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
13949 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
13956 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
13961 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
13962 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
13965 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
13968 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
13969 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
13970 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
13976 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
13984 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
13985 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
13991 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
13994 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
14000 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
14003 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
14004 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14007 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
14010 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14011 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14012 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18
14014 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
14015 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14017 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
14018 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14020 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14022 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14025 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
14026 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14029 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14030 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14032 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
14033 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14035 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
14036 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14038 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
14041 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
14044 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
14049 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14055 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
14056 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
14061 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
14064 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
14065 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14066 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
14073 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
14082 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
14083 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
14089 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
14092 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
14099 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
14106 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
14107 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
14111 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
14114 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
14115 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14116 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19
14123 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
14131 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
14132 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
14138 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
14141 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
14147 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
14152 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
14153 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
14157 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
14160 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
14161 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14162 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17
14168 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
14175 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
14176 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
14181 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
14184 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
14189 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
14192 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
14193 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
14196 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
14199 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
14200 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14201 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1
14202 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14204 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14209 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14211 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14213 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
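;
; The -FCP prefixes select the fast variable cross-lane/per-lane shuffle
; tunings (an assumption from the usual run lines of these generated
; tests). Under them the backend trades the word-unpack chains above for
; vpermd on folded-reloaded index vectors plus broadcast dword patterns
; such as [0,4,8,12,...], so the matched lines show mostly 32- and 64-byte
; ymm/zmm reloads instead of the 16-byte xmm spill traffic of the plain
; AVX512 block.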
14220 ; AVX512DQ: # %bb.0:
14221 ; AVX512DQ-NEXT: subq $520, %rsp # imm = 0x208
14226 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
14230 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
14233 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
14237 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
14239 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
14244 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
14245 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
14251 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14253 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14258 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
14259 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
14261 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
14268 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14271 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14273 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
14279 ; AVX512DQ-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14280 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
14285 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
14290 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
14291 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
14293 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14296 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14298 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
14305 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
14306 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
14309 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14310 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14311 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
14312 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14313 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
14318 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14319 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
14323 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14326 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14329 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
14333 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
14336 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14340 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
14343 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
14344 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
14350 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
14351 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14355 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
14356 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
14359 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
14364 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14366 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
14370 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
14373 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7]
14380 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14381 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
14383 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14385 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14388 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14391 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
14392 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14393 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14395 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14397 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14402 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
14403 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
14406 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14407 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14408 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
14409 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14410 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
14412 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14415 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14417 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14418 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
14423 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
14426 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14427 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
14432 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
14433 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14438 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
14441 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7]
14442 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
14444 ; AVX512DQ-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14448 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
14449 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14453 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14454 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
14457 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14460 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14462 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
14464 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
14467 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14469 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14472 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
14477 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
14478 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
14480 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14482 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
14485 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
14486 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
14487 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14489 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14491 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
14497 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14498 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
14501 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14502 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14503 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
14504 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14505 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
14508 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14509 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
14511 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14515 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14518 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14519 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
14524 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
14525 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14530 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14533 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
14534 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
14537 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14539 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
14543 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14544 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
14547 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14550 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14554 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
14557 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14559 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14562 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
14565 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14567 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
14568 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14570 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14572 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
14575 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14576 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14577 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14579 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14581 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
14585 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14586 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
14589 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14590 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14591 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
14592 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14593 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
14594 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14596 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14598 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14599 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
14602 ; AVX512DQ-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14604 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14607 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14608 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
14613 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
14614 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14619 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14622 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
14623 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
14624 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
14627 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
14628 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
14631 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14632 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
14635 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14638 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
14642 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
14645 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14647 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14650 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
14653 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14655 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
14656 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14658 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
14660 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
14663 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14664 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14665 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
14669 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
14674 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14675 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
14679 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14680 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14681 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
14682 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14683 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
14684 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14686 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14688 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14689 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
14690 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14693 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14696 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14697 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
14702 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
14703 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14707 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14710 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
14711 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
14712 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14714 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14716 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
14717 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14721 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14722 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
14726 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14733 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
14736 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
14738 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14741 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
14744 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
14746 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
14747 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14752 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
14755 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14756 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14761 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
14766 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14767 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
14771 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14772 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14773 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
14774 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14775 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
14776 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14779 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14781 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14782 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
14783 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14785 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14787 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14790 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14791 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
14796 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
14797 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14802 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14805 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
14806 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
14807 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14809 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
14811 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
14812 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
14816 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14817 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
14820 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14825 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
14827 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
14832 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14835 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
14840 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
14845 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
14848 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14849 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14854 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
14859 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14860 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
14863 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14864 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14865 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24
14866 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
14869 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
14871 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14872 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
14873 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14875 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14877 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14880 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14881 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
14886 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
14887 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
14892 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
14895 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
14896 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
14898 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14900 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
14904 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
14905 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
14908 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14913 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
14915 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
14920 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
14923 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
14927 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
14930 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
14933 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14934 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14937 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
14941 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14942 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
14945 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14946 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14947 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
14948 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14950 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14952 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14954 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14956 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14958 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
14959 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14961 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
14963 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
14965 ; AVX512DQ-NEXT: addq $520, %rsp # imm = 0x208
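; The [6,14,...] and [7,15,...] shuffle constants in the block above select byte 6
; and byte 7 of each 8-byte group, i.e. single lanes of a stride-8 i8 deinterleave.
; A minimal IR sketch of the pattern these checks exercise (illustrative only; the
; function name, vector width, and pointer arguments here are assumptions, not
; taken from this test):
;
;   define void @stride8_lane_sketch(ptr %in, ptr %out0, ptr %out7) {
;     %wide = load <64 x i8>, ptr %in, align 1
;     ; lane 0 gathers bytes 0,8,16,...; lane 7 gathers bytes 7,15,23,...
;     %l0 = shufflevector <64 x i8> %wide, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
;     %l7 = shufflevector <64 x i8> %wide, <64 x i8> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
;     store <8 x i8> %l0, ptr %out0, align 1
;     store <8 x i8> %l7, ptr %out7, align 1
;     ret void
;   }
;
; The backend lowers such shuffles to the vpshufb/vpunpcklwd/vpblendd sequences
; checked above, one cascade per extracted lane.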
14970 ; AVX512DQ-FCP: # %bb.0:
14973 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14974 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
14976 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14980 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14982 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14985 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14986 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14992 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
14994 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14998 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14999 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15001 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15003 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15007 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15009 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15012 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
15014 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15020 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15028 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15035 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
15036 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
15044 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
15047 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
15048 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
15049 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
15050 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15058 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
15066 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
15067 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
15074 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
15077 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
15085 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
15091 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
15092 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
15097 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
15100 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
15101 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
15102 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
15103 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15110 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
15116 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
15117 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
15123 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
15126 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
15133 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
15138 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
15139 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
15142 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
15145 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
15146 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
15147 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
15153 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
15161 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
15162 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
15168 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
15171 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
15177 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
15180 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
15181 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
15184 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
15187 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15188 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
15189 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18
15191 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
15192 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15194 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
15195 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15197 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15199 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15202 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
15203 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15206 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15207 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15209 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
15210 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15212 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
15213 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
15215 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
15218 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15221 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
15226 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15232 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
15233 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
15238 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
15241 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
15242 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
15243 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
15250 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
15259 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
15260 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
15266 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
15269 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
15276 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
15283 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
15284 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
15288 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
15291 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
15292 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15293 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19
15300 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
15308 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
15309 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
15315 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
15318 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
15324 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
15329 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
15330 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
15334 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
15337 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
15338 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15339 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17
15345 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
15352 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
15353 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
15358 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
15361 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
15366 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
15369 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
15370 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
15373 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
15376 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
15377 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15378 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1
15379 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15381 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15386 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15388 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15390 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15397 ; AVX512BW: # %bb.0:
15398 ; AVX512BW-NEXT: subq $744, %rsp # imm = 0x2E8
15406 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
15410 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
15413 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
15417 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
15419 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
15424 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
15425 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
15428 ; AVX512BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15431 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
15434 ; AVX512BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15437 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
15438 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
15441 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
15450 ; AVX512BW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15451 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
15455 ; AVX512BW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15458 ; AVX512BW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15459 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
15461 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
15466 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
15467 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
15472 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
15474 ; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15477 ; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15479 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
15480 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
15482 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
15483 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15484 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
15485 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15487 ; AVX512BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15497 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
15503 ; AVX512BW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15505 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
15507 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15511 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
15516 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
15518 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15519 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15522 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
15526 ; AVX512BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15527 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
15528 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
15532 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15538 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
15541 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
15543 ; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15546 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15549 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
15551 ; AVX512BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15553 ; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15556 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
15559 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
15560 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
15562 ; AVX512BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15564 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
15565 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15567 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
15569 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
15570 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
15573 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
15574 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15575 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
15576 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15581 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
15585 ; AVX512BW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15587 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
15590 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
15596 ; AVX512BW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15597 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
15602 ; AVX512BW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15604 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
15607 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
15608 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
15609 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
15612 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
15613 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
15616 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
15617 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
15619 ; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15622 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
15627 ; AVX512BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15628 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
15630 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
15631 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
15633 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
15635 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
15638 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
15639 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
15641 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
15643 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
15644 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
15646 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
15648 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
15651 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
15652 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
15655 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
15658 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
15659 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
15663 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
15664 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15665 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
15666 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15668 ; AVX512BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15671 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15674 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
15676 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
15679 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15684 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
15688 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
15691 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
15692 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
15696 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
15698 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
15700 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
15701 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
15704 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
15709 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
15714 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
15717 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
15722 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
15725 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
15728 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
15729 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15730 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
15734 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
15736 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
15738 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
15739 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
15743 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
15744 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15745 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15746 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15751 ; AVX512BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15752 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
15754 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
15756 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
15758 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
15761 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
15764 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
15766 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
15770 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
15772 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
15775 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
15776 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15781 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
15782 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
15786 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
15787 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
15788 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
15791 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
15794 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
15796 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
15798 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
15801 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
15804 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
15807 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
15809 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
15811 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
15814 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
15815 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
15818 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
15821 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
15822 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
15824 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15827 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
15828 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15829 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
15830 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15832 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15835 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15839 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
15842 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15847 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
15851 ; AVX512BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15853 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
15856 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
15857 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
15861 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
15862 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
15865 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
15866 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
15869 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
15875 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
15878 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
15881 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
15884 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
15885 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
15888 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
15891 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
15892 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15893 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
15896 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
15897 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
15900 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
15901 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
15904 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
15905 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15906 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15907 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15909 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
15911 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
15913 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15917 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
15920 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
15923 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15925 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
15929 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
15932 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
15933 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15936 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
15937 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
15939 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
15941 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
15942 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
15945 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
15950 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
15953 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
15956 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
15960 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
15962 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
15964 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
15967 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
15968 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
15971 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
15975 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
15976 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
15977 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
15980 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
15981 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15982 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
15986 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
15990 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
15993 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
15998 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
16000 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
16002 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
16004 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
16007 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
16008 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
16011 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
16014 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
16015 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
16018 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
16023 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
16026 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
16029 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
16032 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
16035 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
16038 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
16039 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
16042 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
16045 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
16046 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
16049 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
16050 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
16051 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1
16052 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16054 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16056 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16058 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16060 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16062 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16063 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16065 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16067 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16069 ; AVX512BW-NEXT: addq $744, %rsp # imm = 0x2E8
16074 ; AVX512BW-FCP: # %bb.0:
16075 ; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
16077 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16078 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
16080 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16083 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16085 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16088 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
16089 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16093 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16097 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
16098 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
16101 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16105 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16107 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
16109 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
16115 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16119 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16122 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
16129 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
16130 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
16137 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
16140 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
16141 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
16142 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
16143 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16148 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
16153 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
16154 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
16159 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
16162 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
16167 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
16170 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
16171 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
16174 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
16177 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
16178 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16179 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
16180 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16185 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
16190 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
16191 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
16196 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
16199 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
16204 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
16207 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
16208 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
16211 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
16214 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
16215 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16216 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
16217 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16222 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
16227 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
16228 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
16233 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
16236 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
16241 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
16244 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
16245 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
16248 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
16251 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16252 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16253 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
16256 ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
16257 ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
16258 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16260 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16262 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
16265 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16267 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16269 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
16270 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
16274 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16276 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
16278 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
16281 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
16284 ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
16286 ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
16288 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
16293 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
16294 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
16299 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
16302 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
16303 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
16304 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6
16309 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
16314 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
16315 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
16320 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
16323 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
16328 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
16331 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
16332 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
16335 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
16338 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
16339 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
16340 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21
16345 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
16350 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
16351 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
16356 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
16359 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
16364 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
16367 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
16368 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
16371 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
16374 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
16375 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
16376 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
16381 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
16385 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
16386 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16389 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
16392 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16397 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
16400 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
16401 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
16404 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
16407 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
16408 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
16409 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
16410 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16412 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16414 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16419 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16421 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16423 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16425 ; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
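; NOTE: CHECK blocks like these are normally autogenerated rather than edited by
; hand; assuming this is a standard llc test, they would be regenerated with
; llvm/utils/update_llc_test_checks.py (run on the .ll file after changing the
; RUN lines or the IR), which rewrites the assertions for every prefix at once.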
16430 ; AVX512DQ-BW: # %bb.0:
16431 ; AVX512DQ-BW-NEXT: subq $744, %rsp # imm = 0x2E8
16439 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
16443 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
16446 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
16450 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
16452 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
16457 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
16458 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
16461 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16464 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
16467 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16470 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
16471 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
16474 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
16483 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16484 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
16488 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16491 ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16492 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
16494 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
16499 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
16500 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
16505 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
16507 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16510 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16512 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
16513 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
16515 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
16516 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16517 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
16518 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16520 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16530 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
16536 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16538 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
16540 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
16544 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
16549 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
16551 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
16552 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
16555 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
16559 ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16560 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
16561 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
16565 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
16571 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
16574 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
16576 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16579 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
16582 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
16584 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16586 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16589 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
16592 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
16593 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
16595 ; AVX512DQ-BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16597 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
16598 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
16600 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
16602 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
16603 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
16606 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
16607 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16608 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
16609 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16614 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
16618 ; AVX512DQ-BW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16620 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
16623 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
16629 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16630 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
16635 ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16637 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
16640 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
16641 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
16642 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
16645 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
16646 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
16649 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
16650 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
16652 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16655 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
16660 ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16661 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
16663 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
16664 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
16666 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
16668 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
16671 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
16672 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
16674 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
16676 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
16677 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
16679 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
16681 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
16684 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
16685 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
16688 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
16691 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
16692 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
16696 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
16697 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16698 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
16699 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16701 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16704 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
16707 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
16709 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
16712 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
16717 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
16721 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
16724 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
16725 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
16729 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
16731 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
16733 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
16734 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
16737 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
16742 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
16747 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
16750 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
16755 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
16758 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
16761 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
16762 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
16763 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
16767 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
16769 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
16771 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
16772 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
16776 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
16777 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16778 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16779 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16784 ; AVX512DQ-BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16785 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
16787 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
16789 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
16791 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
16794 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
16797 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
16799 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
16803 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
16805 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
16808 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
16809 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
16814 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
16815 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
16819 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
16820 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
16821 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16824 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
16827 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
16829 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
16831 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
16834 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
16837 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
16840 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
16842 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
16844 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
16847 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
16848 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
16851 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
16854 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
16855 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
16857 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16860 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
16861 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16862 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
16863 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16865 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
16868 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
16872 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
16875 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
16880 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
16884 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16886 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
16889 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
16890 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
16894 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
16895 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
16898 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
16899 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
16902 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
16908 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
16911 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
16914 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
16917 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
16918 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
16921 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
16924 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
16925 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
16926 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
16929 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
16930 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
16933 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
16934 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
16937 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
16938 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16939 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16940 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16942 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
16944 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
16946 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
16950 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
16953 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
16956 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
16958 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
16962 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
16965 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
16966 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
16969 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
16970 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
16972 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
16974 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
16975 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
16978 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
16983 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
16986 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
16989 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
16993 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
16995 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
16997 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
17000 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
17001 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
17004 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
17008 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
17009 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
17010 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
17013 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
17014 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17015 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
17019 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
17023 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
17026 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
17031 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
17033 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
17035 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
17037 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
17040 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
17041 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
17044 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
17047 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
17048 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
17051 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
17056 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
17059 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
17062 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
17065 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
17068 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
17071 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
17072 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
17075 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
17078 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
17079 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
17082 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
17083 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
17084 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1
17085 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17087 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17089 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17091 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17093 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17095 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
17096 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17098 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
17100 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
17102 ; AVX512DQ-BW-NEXT: addq $744, %rsp # imm = 0x2E8
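The addq above releases the 744-byte (0x2E8) spill area that the AVX512DQ-BW path uses for all of the 16-byte and 64-byte Spill/Reload traffic in this block; the matching subq sits earlier in the function, outside the matched lines. A minimal sketch of the frame pattern in the same asm style (illustrative only; the offsets are hypothetical and these are not lines from the matched file):

  subq $744, %rsp                # imm = 0x2E8; reserve spill slots at entry
  vmovdqu64 %zmm0, 680(%rsp)     # 64-byte Spill (hypothetical offset)
  vmovdqu64 680(%rsp), %zmm0     # 64-byte Reload of the same slot
  addq $744, %rsp                # imm = 0x2E8; release the slots before returning
  retq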
17107 ; AVX512DQ-BW-FCP: # %bb.0:
17108 ; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
17110 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17111 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
17113 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17116 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17118 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17121 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
17122 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17126 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17130 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
17131 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
17134 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17138 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17140 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
17142 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
17148 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17152 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17155 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
17162 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
17163 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
17170 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
17173 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
17174 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
17175 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
17176 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17181 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
17186 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
17187 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
17192 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
17195 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
17200 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
17203 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
17204 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
17207 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
17210 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
17211 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
17212 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
17213 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17218 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
17223 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
17224 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
17229 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
17232 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
17237 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
17240 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
17241 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
17244 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
17247 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
17248 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
17249 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
17250 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17255 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
17260 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
17261 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
17266 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
17269 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
17274 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
17277 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
17278 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
17281 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
17284 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
17285 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
17286 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
17289 ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
17290 ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
17291 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17293 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17295 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
17298 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17300 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17302 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
17303 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
17307 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17309 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
17311 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
17314 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
17317 ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
17319 ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
17321 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
17326 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
17327 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
17332 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
17335 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
17336 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
17337 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6
17342 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
17347 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
17348 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
17353 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
17356 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
17361 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
17364 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
17365 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
17368 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
17371 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
17372 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
17373 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21
17378 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
17383 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
17384 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
17389 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
17392 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
17397 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
17400 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
17401 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
17404 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
17407 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
17408 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
17409 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
17414 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
17418 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
17419 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
17422 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
17425 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
17430 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
17433 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
17434 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
17437 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
17440 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
17441 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
17442 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
17443 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17445 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17447 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17452 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17454 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17456 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17458 ; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
17462 %strided.vec0 = shufflevector <512 x i8> %wide.vec, <512 x i8> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
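The matched IR line above is the access pattern that every check prefix in this file lowers: from a <512 x i8> wide load it gathers every eighth byte (indices 0, 8, ..., 504) into a <64 x i8> result, i.e. lane 0 of a stride-8 deinterleave; the remaining lanes presumably use the same mask shifted by 1 through 7, though only lane 0 is among the matched lines. A minimal sketch of the same pattern at a smaller width (hypothetical function and value names, not taken from this test):

define <8 x i8> @deinterleave_lane0(ptr %p) {
  ; load 64 interleaved bytes: 8 elements from each of 8 interleaved streams
  %wide.vec = load <64 x i8>, ptr %p, align 64
  ; keep lane 0 of each 8-byte group, i.e. indices 0, 8, ..., 56
  %strided.vec0 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  ret <8 x i8> %strided.vec0
}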