; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,7,5]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
  %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <2 x i16> %in.vec4, <2 x i16> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <2 x i16> %in.vec6, <2 x i16> %in.vec7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = shufflevector <4 x i16> %1, <4 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %7 = shufflevector <8 x i16> %5, <8 x i16> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i16> %7, <16 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
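; The IR above first concatenates the eight <2 x i16> inputs pairwise into a
; single <16 x i16> value %7; %interleaved.vec then reorders it so that lane 0
; of every input comes first, followed by lane 1 of every input. The stored
; layout is out[j*8+k] = in.veck[j], i.e. an 8-way (stride-8) interleaved store.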
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm3[0],xmm0[0]
; AVX-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm6[0],xmm5[0]
; AVX-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0]
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,3,1,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,3,1,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,2,0,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[2,0,2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm1[1,2,3],xmm11[4],xmm1[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,8,9,0,1,8,9,16,17,18,19,20,21,22,23,18,19,26,27,18,19,26,27]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,8,9,10,11,12,13,14,15,18,19,26,27,18,19,26,27,24,25,26,27,28,29,30,31]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,4,5,12,13,4,5,12,13,16,17,18,19,20,21,22,23,22,23,30,31,22,23,30,31]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
  %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <4 x i16> %in.vec4, <4 x i16> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <4 x i16> %in.vec6, <4 x i16> %in.vec7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %6 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %7 = shufflevector <16 x i16> %5, <16 x i16> %6, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i16> %7, <32 x i16> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
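; Same construction with <4 x i16> inputs: %7 concatenates all eight vectors
; into a <32 x i16> value, and %interleaved.vec groups lane j of every input
; together for j = 0..3, again yielding out[j*8+k] = in.veck[j].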
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0]
; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,0,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5],xmm14[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5],ymm13[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5],xmm6[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0]
; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0]
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59]
  %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = shufflevector <8 x i16> %in.vec4, <8 x i16> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = shufflevector <8 x i16> %in.vec6, <8 x i16> %in.vec7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %6 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %7 = shufflevector <32 x i16> %5, <32 x i16> %6, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %interleaved.vec = shufflevector <64 x i16> %7, <64 x i16> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
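; And with <8 x i16> inputs: the eight vectors become one <64 x i16> value %7,
; which %interleaved.vec permutes so that, for each lane j = 0..7, the j-th
; element of every input vector is stored contiguously (out[j*8+k] = in.veck[j]).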
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7]
; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7]
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11]
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5],ymm14[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,1,1,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,2,2,2,0,0,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [2,2,3,3,3,3,0,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2,3,4],ymm2[5],ymm11[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,1,1]
; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,1,1,0,0]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0]
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0]
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0]
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0]
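; The vpmovsxbw constants in the AVX512BW-family blocks above are word-permute
; index vectors, stored as sign-extended bytes to shrink the constant pool.
; Each pair of masks appears to partition the 32 output lanes, with one mask
; carrying the live indices for its lanes and zeros standing in for the lanes
; its partner supplies; the two-source word permutes that consume these masks
; do not match the search pattern, so that pairing is inferred rather than
; shown here.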
%1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%2 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%3 = shufflevector <16 x i16> %in.vec4, <16 x i16> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%4 = shufflevector <16 x i16> %in.vec6, <16 x i16> %in.vec7, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%5 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%6 = shufflevector <32 x i16> %3, <32 x i16> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%7 = shufflevector <64 x i16> %5, <64 x i16> %6, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%interleaved.vec = shufflevector <128 x i16> %7, <128 x i16> poison, <128 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 112, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 113, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 114, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 115, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 116, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 117, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 118, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 119, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 120, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 121, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 122, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 123, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 124, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 125, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 126, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111, i32 127>
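; As in the vf8 function above, the final shuffle is a stride-8 transpose,
; here over eight logical <16 x i16> inputs V0..V7 concatenated into %7:
; output word 8*i+j is element i of Vj, with the mask stepping through
; element 0 of every source (indices 0,16,32,...,112), then element 1, and
; so on up to element 15.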
2732 ; SSE: # %bb.0:
2733 ; SSE-NEXT: subq $264, %rsp # imm = 0x108
2734 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2735 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
2745 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
2747 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
2749 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2751 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
2752 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0]
2754 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
2755 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,0,0]
2757 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
2758 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2760 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
2763 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
2764 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,3]
2765 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2771 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
2772 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2777 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
2779 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3]
2780 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2785 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
2787 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,0,0]
2789 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0]
2791 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1]
2792 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2794 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
2797 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
2798 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3]
2799 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2805 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1]
2806 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2809 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
2810 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
2811 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2813 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
2814 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
2816 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
2817 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,0,0]
2823 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
2827 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
2829 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
2830 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
2831 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2834 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
2836 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
2837 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
2838 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2844 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
2845 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2848 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
2849 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3]
2850 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2855 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
2856 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
2859 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2860 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
2861 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2864 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
2866 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2867 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
2868 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2874 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
2875 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2878 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2879 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
2884 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
2888 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
2889 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0]
2890 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
2896 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
2900 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
2902 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
2903 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
2904 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2907 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
2909 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
2910 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
2911 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2917 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
2918 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2921 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
2922 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3]
2923 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2928 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
2929 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
2932 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2933 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
2934 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2937 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2939 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
2940 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
2941 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2947 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1]
2950 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
2951 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3]
2954 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2956 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2959 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2961 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2962 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
2963 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0]
2968 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
2972 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
2974 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
2975 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1]
2978 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
2980 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
2981 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3]
2987 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
2990 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2991 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3]
2994 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2996 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2998 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
2999 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
3002 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
3003 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3006 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
3008 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
3009 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
3015 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
3018 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3019 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
3020 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3031 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3033 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3035 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3037 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3039 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3041 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3045 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3047 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3049 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3051 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3053 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3055 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3057 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3059 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3061 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3063 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3065 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3067 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3069 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3071 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3073 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3075 ; SSE-NEXT: addq $264, %rsp # imm = 0x108
3079 ; AVX: # %bb.0:
3080 ; AVX-NEXT: subq $296, %rsp # imm = 0x128
3081 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
3082 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
3084 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3086 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3087 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3091 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3093 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3094 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3097 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
3099 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3101 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3102 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3107 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3109 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3110 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
3115 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
3117 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
3118 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3119 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
3120 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
3122 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
3123 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
3126 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
3129 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
3131 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
3132 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
3133 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3135 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0]
3136 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1]
3140 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1]
3141 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
3143 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
3152 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1]
3154 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7]
3155 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
3156 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3161 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
3168 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
3169 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
3170 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3171 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3172 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
3173 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1]
3175 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
3176 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
3177 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero
3179 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
3180 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
3181 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
3184 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
3186 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
3187 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
3188 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3193 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
3200 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
3202 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3203 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3206 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0]
3207 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1]
3212 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1]
3213 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
3215 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
3224 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1]
3226 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
3227 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
3228 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3233 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
3240 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
3241 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
3242 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3243 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3244 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3245 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
3247 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3248 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
3249 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
3251 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
3252 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
3253 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
3256 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
3258 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
3259 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
3265 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
3272 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
3273 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3277 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0]
3278 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1]
3283 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1]
3284 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero
3286 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
3295 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1]
3297 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
3298 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
3303 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7]
3310 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
3311 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3312 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3313 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
3314 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
3316 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
3317 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
3318 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
3320 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7]
3321 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3322 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3325 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
3327 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
3328 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7]
3333 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
3340 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
3341 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3342 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3343 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
3345 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
3346 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
3348 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3349 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
3351 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
3352 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero
3354 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
3355 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3356 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
3358 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3359 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
3363 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1]
3365 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
3366 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
3371 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
3378 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
3379 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
3380 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
3390 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3392 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3394 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3396 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3398 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3400 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3402 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3404 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3406 ; AVX-NEXT: addq $296, %rsp # imm = 0x128
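The AVX checks above realize the interleave with a ladder of unpacks: vpunpcklwd pairs 16-bit words from two sources, and the later dword shuffles and blends merge those pairs. A minimal C sketch of that two-level unpack ladder (plain SSE2 intrinsics; the helper name and the four-source framing are illustrative, not taken from the test):

    #include <emmintrin.h>  /* SSE2 */

    /* Interleave the low four 16-bit lanes of a,b,c,d into out[0..1] as
       a0 b0 c0 d0 a1 b1 c1 d1 ... (one element from each source per group). */
    static void interleave4_epi16(__m128i a, __m128i b, __m128i c, __m128i d,
                                  __m128i out[2]) {
        __m128i ab = _mm_unpacklo_epi16(a, b);  /* a0 b0 a1 b1 a2 b2 a3 b3 */
        __m128i cd = _mm_unpacklo_epi16(c, d);  /* c0 d0 c1 d1 c2 d2 c3 d3 */
        out[0] = _mm_unpacklo_epi32(ab, cd);    /* a0 b0 c0 d0 a1 b1 c1 d1 */
        out[1] = _mm_unpackhi_epi32(ab, cd);    /* a2 b2 c2 d2 a3 b3 c3 d3 */
    }

Each unpack level doubles the size of the interleaved group, so a handful of levels covers many sources before the blend cleanup seen in the checks.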
3411 ; AVX2: # %bb.0:
3413 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3414 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
3416 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3421 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3423 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3425 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3426 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
3428 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3430 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3432 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3433 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3436 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
3438 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
3439 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
3440 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3442 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3444 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3445 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
3446 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
3447 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3448 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3451 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
3452 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1]
3459 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3460 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
3461 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
3462 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3465 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,0,2,1]
3466 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3467 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3468 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3469 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3470 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
3471 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3472 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
3473 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3474 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
3475 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
3476 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
3477 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
3478 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3482 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3483 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
3484 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3485 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3488 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
3491 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3492 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
3495 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3496 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
3498 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3499 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
3501 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
3502 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
3503 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
3507 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3509 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
3510 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3511 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3514 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
3515 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
3521 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3523 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
3524 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3527 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
3529 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3530 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3531 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
3532 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11]
3533 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11]
3534 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
3535 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
3536 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11]
3537 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
3539 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
3545 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3547 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
3551 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5]
3552 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
3559 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3561 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
3564 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,2,2,3]
3566 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3567 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
3568 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
3569 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
3570 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
3571 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
3572 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
3573 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
3575 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
3578 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3580 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3581 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3588 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3590 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3592 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3596 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3598 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3600 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3602 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3604 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3606 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
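One wrinkle visible in the AVX2 comments above: the 256-bit vpunpcklwd/vpunpckldq forms interleave within each 128-bit lane (hence source indices 0..3 and 8..11 in the element comments), so the pattern is followed by vpermq to move 64-bit chunks across lanes and vpblendd to merge pairs of results. A hedged C sketch of that fixup, reusing the [0,0,2,1] / [0,1,1,3] permutes and the dword-pair blend from the checks (the helper name and the pairing of inputs are assumptions):

    #include <immintrin.h>  /* AVX2 */

    static __m256i merge_lanes(__m256i lo_halves, __m256i hi_halves) {
        /* dst[i] = src[sel[i]]; [0,0,2,1] and [0,1,1,3] as in the checks. */
        __m256i a = _mm256_permute4x64_epi64(lo_halves, _MM_SHUFFLE(1, 2, 0, 0));
        __m256i b = _mm256_permute4x64_epi64(hi_halves, _MM_SHUFFLE(3, 1, 1, 0));
        return _mm256_blend_epi32(b, a, 0xCC);  /* take a in dwords 2,3,6,7 */
    }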
3613 ; AVX2-FP: # %bb.0:
3615 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3616 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3618 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3623 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3625 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3627 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3628 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
3630 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3632 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3634 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3635 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3638 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
3640 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
3641 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
3642 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3644 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3646 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3647 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
3648 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
3649 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3650 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3653 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
3654 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1]
3661 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3662 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
3663 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
3664 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3667 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,0,2,1]
3668 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3669 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3670 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3671 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3672 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
3673 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3674 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
3675 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3676 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
3677 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
3678 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
3679 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
3680 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3684 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3685 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
3686 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3687 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3690 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
3693 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3694 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
3697 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3698 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
3700 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3701 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
3703 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
3704 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
3705 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
3709 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3711 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
3712 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3713 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3716 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
3717 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
3723 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3725 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
3726 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3729 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
3731 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3732 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3733 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
3734 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11]
3735 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11]
3736 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
3737 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
3738 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11]
3739 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
3741 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
3747 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3749 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
3753 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5]
3754 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
3761 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3763 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
3766 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,2,2,3]
3768 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3769 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
3770 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
3771 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
3772 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
3773 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
3774 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
3775 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
3777 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
3780 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3782 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
3783 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3790 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3792 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3794 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3798 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3800 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3802 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3804 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3806 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3808 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3815 ; AVX2-FCP: # %bb.0:
3816 ; AVX2-FCP-NEXT: subq $296, %rsp # imm = 0x128
3817 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3818 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3820 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3822 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3823 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3824 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,2,2,0,0,3,3]
3827 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3830 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3832 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7]
3834 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3836 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3837 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3838 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,2,3,3,3,3,0,0]
3841 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3843 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3844 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3846 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
3847 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7]
3848 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3849 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1]
3852 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,0,1]
3854 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7]
3855 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0]
3858 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,1,1,0]
3860 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
3861 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
3862 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3871 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7]
3880 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
3881 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7]
3882 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3883 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,2,2,0,0,3,3]
3886 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
3887 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,3,3,3,3,0,0]
3890 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
3891 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
3892 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3893 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
3894 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
3895 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1]
3897 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,0,1]
3899 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
3900 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
3901 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
3902 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,1,1,1,1,0,0]
3904 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,1,1,0]
3906 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
3907 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
3908 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3911 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
3915 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
3916 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3917 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3918 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3919 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3922 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
3926 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
3927 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3928 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
3930 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3931 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
3936 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
3938 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
3939 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3944 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7]
3947 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
3948 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
3949 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3951 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5]
3954 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7]
3956 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
3967 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
3968 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7]
3970 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3974 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7]
3979 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7]
3980 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7]
3981 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3982 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
3983 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
3984 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5]
3986 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7]
3988 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
3989 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
3990 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
3995 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
3996 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7]
3997 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3998 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4002 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
4005 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7]
4006 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
4007 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4016 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7]
4025 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7]
4026 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7]
4027 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7]
4031 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
4036 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
4037 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
4038 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
4039 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
4040 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5]
4042 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
4044 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
4045 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
4046 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11]
4051 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
4052 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
4055 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7]
4059 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
4060 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
4061 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4066 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4068 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4070 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4074 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4076 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4078 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4080 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4082 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4084 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4086 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4088 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4090 ; AVX2-FCP-NEXT: addq $296, %rsp # imm = 0x128
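In the AVX2-FCP flavor, every shuffle pattern is stored as a byte vector and widened on load with vpmovsxbd, which shrinks the constant pool; the permutes that consume these dword indices do not themselves match this listing, but vpermd is their natural consumer. A sketch under that assumption, reusing the [2,2,2,2,0,0,3,3] index from line 3824:

    #include <immintrin.h>  /* AVX2 */

    static __m256i permute_like_fcp(__m256i v) {
        /* Byte constant sign-extended to dwords, as vpmovsxbd does on load. */
        const __m256i idx = _mm256_cvtepi8_epi32(
            _mm_setr_epi8(2, 2, 2, 2, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0));
        return _mm256_permutevar8x32_epi32(v, idx);  /* dst[i] = v[idx[i]] */
    }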
4095 ; AVX512: # %bb.0:
4096 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
4097 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
4099 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4101 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4104 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4107 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
4109 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
4110 ; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888
4119 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
4121 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
4122 ; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222
4127 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11]
4130 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11]
4131 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
4133 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
4138 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
4143 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11]
4160 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
4162 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
4164 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4167 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
4181 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11]
4183 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4193 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
4195 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11]
4201 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4202 ; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4203 ; AVX512-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4204 ; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4205 ; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
4208 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
4209 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
4222 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
4235 ; AVX512-FCP: # %bb.0:
4236 ; AVX512-FCP-NEXT: subq $520, %rsp # imm = 0x208
4237 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4238 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4245 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4252 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4259 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4268 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4272 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
4276 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
4280 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
4287 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4290 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4293 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4294 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
4295 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4298 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4300 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4303 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4305 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4308 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4312 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4316 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4320 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4324 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4328 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4331 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4332 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4334 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4336 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4338 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4345 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
4348 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
4349 ; AVX512-FCP-NEXT: movw $-30584, %ax # imm = 0x8888
4351 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15]
4353 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15]
4359 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
4361 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
4379 ; AVX512-FCP-NEXT: movw $8738, %ax # imm = 0x2222
4392 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11]
4393 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
4394 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 {%k1} # 64-byte Folded Reload
4395 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11]
4402 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0]
4404 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0]
4405 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload
4406 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0]
4423 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4432 ; AVX512-FCP-NEXT: addq $520, %rsp # imm = 0x208
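The AVX512-FCP path makes the merge explicit: lines 4393-4394 run vpermd twice with the same index vector, the second time under the {%k1} write mask, so the second permute only fills the lanes the mask selects (0x8888 keeps every fourth dword) while the rest retain the first result. A minimal sketch of that merge-masked permute (all inputs assumed):

    #include <immintrin.h>  /* AVX-512F */

    static __m512i masked_permute(__m512i dst, __m512i idx, __m512i src) {
        __mmask16 k = (__mmask16)0x8888;  /* the movw $-30584 in the checks */
        /* Lanes with k set get src[idx[i]]; the others keep dst's contents. */
        return _mm512_mask_permutexvar_epi32(dst, k, idx, src);
    }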
4437 ; AVX512DQ: # %bb.0:
4438 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4439 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
4441 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4443 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4446 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4449 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
4451 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
4452 ; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888
4461 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
4463 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
4464 ; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222
4469 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11]
4472 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11]
4473 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
4475 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
4480 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
4485 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11]
4502 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
4504 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
4506 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4509 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
4523 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11]
4525 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4535 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
4537 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11]
4543 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4544 ; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4545 ; AVX512DQ-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4546 ; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4547 ; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
4550 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
4551 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
4564 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4577 ; AVX512DQ-FCP: # %bb.0:
4578 ; AVX512DQ-FCP-NEXT: subq $520, %rsp # imm = 0x208
4579 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4580 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4587 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4594 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4601 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4610 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4614 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
4618 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
4622 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
4629 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4632 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4635 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4636 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
4637 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4640 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4642 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4645 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4647 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4650 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4654 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4658 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4662 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4666 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4670 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4673 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4674 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4676 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4678 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4680 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4687 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
4690 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
4691 ; AVX512DQ-FCP-NEXT: movw $-30584, %ax # imm = 0x8888
4693 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15]
4695 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15]
4701 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
4703 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
4721 ; AVX512DQ-FCP-NEXT: movw $8738, %ax # imm = 0x2222
4734 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11]
4735 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
4736 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 {%k1} # 64-byte Folded Reload
4737 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11]
4744 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0]
4746 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0]
4747 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload
4748 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0]
4765 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4774 ; AVX512DQ-FCP-NEXT: addq $520, %rsp # imm = 0x208
4779 ; AVX512BW: # %bb.0:
4780 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4781 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
4782 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
4791 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
4793 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
4795 ; AVX512BW-NEXT: movw $-30584, %cx # imm = 0x8888
4798 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
4800 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
4802 ; AVX512BW-NEXT: movw $8738, %cx # imm = 0x2222
4808 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
4810 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
4813 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
4815 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
4819 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
4821 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
4824 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
4826 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
4830 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
4832 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
4835 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
4837 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
4841 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
4843 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
4846 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
4848 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
4852 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
4854 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
4857 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
4859 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
4863 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
4865 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
4868 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
4870 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
4874 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
4876 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
4879 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
4881 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
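The AVX512BW variants instead build 32-entry word-index vectors whose values span 0-63; in the two-source permute form used by vpermt2w/vpermi2w, indices 0-31 select words from the first source and 32-63 from the second. The permute instructions themselves do not appear in this match listing, so that pairing is an assumption, and the sketch below uses a hypothetical alternating index rather than one of the constants above:

    #include <immintrin.h>  /* AVX-512BW */

    static __m512i interleave_low_words(__m512i a, __m512i b) {
        /* Hypothetical index: a0 b0 a1 b1 ... across all 32 result words. */
        static const short idx_arr[32] = {
             0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39,
             8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
        const __m512i idx = _mm512_loadu_si512(idx_arr);
        return _mm512_permutex2var_epi16(a, idx, b);  /* vpermt2w form */
    }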
4897 ; AVX512BW-FCP: # %bb.0:
4898 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4899 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4900 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
4909 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
4911 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
4913 ; AVX512BW-FCP-NEXT: movw $-30584, %cx # imm = 0x8888
4916 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
4918 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
4920 ; AVX512BW-FCP-NEXT: movw $8738, %cx # imm = 0x2222
4926 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
4928 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
4931 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
4933 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
4937 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
4939 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
4942 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
4944 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
4948 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
4950 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
4953 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
4955 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
4959 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
4961 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
4964 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
4966 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
4970 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
4972 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
4975 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
4977 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
4981 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
4983 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
4986 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
4988 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
4992 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
4994 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
4997 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
4999 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
5015 ; AVX512DQ-BW: # %bb.0:
5016 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5017 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
5018 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
5027 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
5029 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
5031 ; AVX512DQ-BW-NEXT: movw $-30584, %cx # imm = 0x8888
5034 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
5036 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
5038 ; AVX512DQ-BW-NEXT: movw $8738, %cx # imm = 0x2222
5044 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
5046 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
5049 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
5051 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
5055 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
5057 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
5060 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
5062 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
5066 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
5068 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
5071 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
5073 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
5077 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
5079 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
5082 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
5084 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
5088 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
5090 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
5093 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
5095 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
5099 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
5101 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
5104 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
5106 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
5110 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
5112 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
5115 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
5117 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
5133 ; AVX512DQ-BW-FCP: # %bb.0:
5134 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5135 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
5136 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
5145 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
5147 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
5149 ; AVX512DQ-BW-FCP-NEXT: movw $-30584, %cx # imm = 0x8888
5152 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
5154 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
5156 ; AVX512DQ-BW-FCP-NEXT: movw $8738, %cx # imm = 0x2222
5162 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
5164 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
5167 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
5169 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
5173 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
5175 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
5178 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
5180 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
5184 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
5186 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
5189 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
5191 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
5195 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
5197 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
5200 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
5202 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
5206 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
5208 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
5211 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
5213 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
5217 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
5219 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
5222 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
5224 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
5228 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
5230 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
5233 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
5235 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
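; The IR below is the actual test body: three layers of pairwise
; concatenating shufflevectors build a single <256 x i16> value from eight
; <32 x i16> inputs, and the final shufflevector performs the stride-8
; interleave, i.e. it transposes the 8 x 32 matrix of words so that element i
; of every input vector lands in eight consecutive words of the result.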
5257 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5258 %2 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5259 %3 = shufflevector <32 x i16> %in.vec4, <32 x i16> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5260 %4 = shufflevector <32 x i16> %in.vec6, <32 x i16> %in.vec7, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5261 %5 = shufflevector <64 x i16> %1, <64 x i16> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5262 %6 = shufflevector <64 x i16> %3, <64 x i16> %4, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5263 %7 = shufflevector <128 x i16> %5, <128 x i16> %6, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
5264 %interleaved.vec = shufflevector <256 x i16> %7, <256 x i16> poison, <256 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 224, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 225, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 226, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 227, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 228, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 229, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 230, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 231, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 232, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 233, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 234, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 235, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 236, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 237, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 238, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 239, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 240, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 241, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 242, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 243, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 244, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 245, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 246, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 215, i32 247, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 248, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 249, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 250, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 251, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 252, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 253, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 254, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223, i32 255>
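; A minimal sketch of the same idiom at stride 2 (a hypothetical function,
; not part of this test): interleaving two <4 x i16> vectors with a single
; shufflevector, where mask indices 0-3 address %a and 4-7 address %b.
define <8 x i16> @interleave_stride2_sketch(<4 x i16> %a, <4 x i16> %b) {
  ; Output word k takes input word (k mod 2) * 4 + (k div 2), producing
  ; a0, b0, a1, b1, a2, b2, a3, b3.
  %v = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %v
}
; The %interleaved.vec mask above follows the same formula at stride 8:
; output word k reads input word (k mod 8) * 32 + (k div 8).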
5271 ; SSE: # %bb.0:
5272 ; SSE-NEXT: subq $776, %rsp # imm = 0x308
5273 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5274 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
5284 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
5286 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
5288 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5290 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
5291 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0]
5293 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3]
5294 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,0,0]
5296 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1]
5297 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5299 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
5302 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
5303 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3]
5304 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5310 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
5311 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5316 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
5318 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3]
5319 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
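; Pattern of the SSE lowering, repeated below for every block of output rows:
; punpcklwd interleaves 16-bit words, punpckldq widens the interleave to
; dword granularity, shufps/movsd splice the two halves, and each finished
; row is spilled, since the working set exceeds the sixteen xmm registers.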
5324 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5326 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,0,0]
5328 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0]
5330 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1]
5331 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5333 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
5336 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5337 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3]
5338 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5344 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
5345 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5348 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
5349 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3]
5350 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5352 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5353 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,0,0]
5355 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
5356 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
5362 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
5366 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5368 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
5369 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
5370 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5373 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
5375 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
5376 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
5377 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5383 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
5384 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5387 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
5388 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,3]
5389 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5394 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0]
5395 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
5398 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5399 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
5400 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5403 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5405 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5406 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5407 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5413 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
5414 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5417 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5418 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
5419 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5423 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
5427 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
5428 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0]
5429 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
5435 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
5439 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5441 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
5442 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
5443 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5446 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
5448 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5449 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
5450 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5456 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
5457 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5460 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5461 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3]
5462 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5467 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
5468 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
5471 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5472 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
5473 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5476 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5478 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5479 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5480 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5486 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
5487 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5490 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5491 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3]
5492 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5496 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
5500 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
5501 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0]
5502 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
5508 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
5512 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5514 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
5515 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
5516 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5519 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
5521 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5522 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
5523 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5529 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
5530 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5533 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5534 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3]
5535 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5540 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
5541 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
5544 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5545 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
5546 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5549 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5551 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5552 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5553 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5559 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
5560 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5563 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5564 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3]
5565 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5569 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
5573 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
5574 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0]
5575 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
5581 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
5585 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5587 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
5588 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
5589 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5592 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
5594 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5595 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
5596 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5602 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
5603 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5606 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5607 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3]
5608 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5613 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
5614 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
5617 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5618 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
5619 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5622 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5624 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5625 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5626 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5632 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
5633 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5636 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5637 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3]
5638 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5642 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
5646 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
5647 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0]
5648 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
5654 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
5658 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5660 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
5661 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
5662 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5665 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
5667 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5668 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
5669 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5675 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
5676 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5679 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5680 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3]
5681 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5686 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
5687 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
5690 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5691 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
5692 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5695 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5697 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5698 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5699 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5705 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
5706 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5709 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5710 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3]
5715 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
5719 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
5720 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0]
5721 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0]
5727 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
5731 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
5733 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
5734 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
5735 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5738 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
5740 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5741 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3]
5742 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5748 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1]
5749 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5752 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
5753 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3]
5754 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5759 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
5760 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
5763 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5764 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
5765 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5768 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5770 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5771 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
5772 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5778 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1]
5781 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
5782 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3]
5785 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5787 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
5790 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5792 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5793 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
5794 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0]
5799 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
5803 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
5805 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5806 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1]
5809 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
5811 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
5812 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3]
5818 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
5821 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
5822 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3]
5825 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
5827 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5829 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
5830 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
5833 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
5834 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5837 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
5839 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
5840 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
5846 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
5849 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
5850 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
5851 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
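; Final phase of the SSE version: the rows spilled above are reloaded through
; xmm0 and written out. The paired store instructions appear to sit between
; these reloads but contain no hit for the search term, so this
; matching-lines view omits them.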
5862 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5864 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5866 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5868 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5870 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5872 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5876 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5878 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5880 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5882 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5884 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5886 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5888 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5890 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5892 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5894 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5896 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5898 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5900 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5902 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5904 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5906 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5908 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5910 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5912 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5914 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5916 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5918 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5920 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5922 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5924 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5926 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5928 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5930 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5932 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5934 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5936 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5938 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5940 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5942 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5944 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5946 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5948 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5950 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5952 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5954 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5956 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5958 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5960 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5962 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5964 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5966 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5968 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5970 ; SSE-NEXT: addq $776, %rsp # imm = 0x308
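; End of the SSE lowering: the addq releases the 776-byte frame reserved by
; the subq at entry, balancing all of the 16-byte spill slots used while
; building the transpose. The AVX version that follows gets by with a
; slightly smaller 744-byte frame by spilling whole 256-bit rows instead.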
5974 ; AVX: # %bb.0:
5975 ; AVX-NEXT: subq $744, %rsp # imm = 0x2E8
5976 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
5977 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
5981 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
5982 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
5983 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
5987 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
5988 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1]
5989 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero
5991 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
5994 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
5999 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
6000 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,0,1,1]
6002 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
6004 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
6005 ; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
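; The AVX variant assembles a full 256-bit row before each spill: vpunpcklwd
; forms the word interleaves in xmm halves, vpshufd/vpmovzxdq position them,
; and vblendps stitches the 128-bit pieces into a ymm row that is then
; spilled as a single 32-byte unit.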
6011 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6],ymm13[7]
6019 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
6021 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
6022 ; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6024 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,0,0]
6025 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,1,0,1]
6028 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,1]
6029 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm8[0],zero,xmm8[1],zero
6032 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm6[3],ymm13[4,5,6],ymm6[7]
6038 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
6040 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
6041 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7]
6042 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6047 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7]
6054 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
6055 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
6056 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6057 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6058 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,0,0]
6059 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,1]
6061 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6062 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,1,0,1]
6063 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm9[0],zero,xmm9[1],zero
6065 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
6066 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
6070 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
6071 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1]
6073 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
6074 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
6075 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6080 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm5[3],ymm9[4,5,6],ymm5[7]
6087 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7]
6089 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
6090 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6092 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
6093 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
6096 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
6097 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero
6100 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
6106 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1]
6108 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7]
6110 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
6111 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6116 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7]
6123 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
6124 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
6125 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6126 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
6127 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0]
6128 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1]
6130 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6131 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1]
6132 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
6134 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
6137 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
6142 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
6143 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1]
6145 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
6146 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
6147 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6152 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
6159 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
6160 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
6161 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6163 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,0,0]
6164 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1]
6167 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
6168 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero
6170 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7]
6175 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
6177 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
6178 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
6179 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6185 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
6193 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7]
6194 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
6195 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6196 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6197 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0]
6198 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1]
6202 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6203 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1]
6204 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
6206 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
6209 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6214 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
6215 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1]
6217 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
6218 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
6219 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6224 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
6231 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
6232 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
6233 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6235 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6236 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
6239 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6240 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
6242 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
6247 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
6249 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
6250 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
6251 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6256 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
6263 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
6264 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6265 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6268 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6269 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0]
6270 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1]
6274 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6275 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1]
6276 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
6278 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
6281 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6286 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
6287 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1]
6289 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
6290 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
6291 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6296 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
6303 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
6304 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
6305 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6307 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6308 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
6311 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6312 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
6314 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
6319 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
6321 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
6322 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
6323 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6328 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
6335 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
6336 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6337 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6340 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6341 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0]
6342 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1]
6346 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6347 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1]
6348 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
6350 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
6353 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6358 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
6359 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1]
6361 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
6362 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
6363 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6368 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
6375 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
6376 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
6377 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6379 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6380 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
6383 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6384 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
6386 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
6391 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
6393 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
6394 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
6400 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
6407 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
6408 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6409 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6412 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
6413 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0]
6414 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1]
6418 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6419 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1]
6420 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero
6422 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
6425 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6430 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
6431 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1]
6433 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
6434 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
6435 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6440 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
6447 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7]
6448 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7]
6449 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6451 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6452 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
6455 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6456 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
6458 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
6463 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
6465 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
6466 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
6467 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6472 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
6479 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
6480 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6483 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
6484 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0]
6485 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1]
6489 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
6490 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1]
6491 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero
6493 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
6496 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
6501 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6502 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1]
6504 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
6505 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
6510 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7]
6517 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
6518 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
6520 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6521 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
6524 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
6525 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
6527 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7]
6532 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
6534 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
6535 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
6540 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
6547 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
6548 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
6549 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
6555 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6557 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6559 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6561 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6565 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6567 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6569 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6571 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6573 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6575 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6577 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6579 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6581 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6583 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6585 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6587 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6589 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6591 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6593 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6595 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6597 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6599 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6601 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6603 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6605 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6607 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6609 ; AVX-NEXT: addq $744, %rsp # imm = 0x2E8
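The AVX block above builds a word-granularity transpose out of vpunpcklwd, vpshufd/vpmovzxdq, and vblendps, spilling finished 32-byte rows to the stack and reloading them in the store epilogue once the sixteen ymm registers run out. For readers tracing the CHECK masks, the core punpcklwd step can be reproduced with intrinsics. A minimal standalone sketch, not part of the test, assuming a C compiler targeting AVX; the values and variable names are illustrative only:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* Two vectors of eight 16-bit lanes. */
        __m128i a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        /* punpcklwd interleaves the low four words of its two sources,
           i.e. the CHECK-mask pattern
           dst = a[0],b[0],a[1],b[1],a[2],b[2],a[3],b[3]. */
        __m128i lo = _mm_unpacklo_epi16(a, b);
        unsigned short out[8];
        _mm_storeu_si128((__m128i *)out, lo);
        for (int i = 0; i < 8; i++)
            printf("%u ", out[i]);   /* prints: 0 8 1 9 2 10 3 11 */
        putchar('\n');
        return 0;
    }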
6614 ; AVX2: # %bb.0:
6615 ; AVX2-NEXT: subq $712, %rsp # imm = 0x2C8
6616 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6617 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
6622 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
6625 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
6626 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6627 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6630 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6633 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
6634 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
6635 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
6636 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7]
6637 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6643 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,2,1]
6644 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
6645 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7]
6646 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6653 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
6654 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
6656 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
6657 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
6658 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7]
6659 ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6661 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
6663 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
6664 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
6665 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6666 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6667 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6668 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
6669 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
6670 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
6671 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
6672 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
6673 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
6674 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
6675 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6679 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
6680 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
6681 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
6682 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6689 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
6690 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6692 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
6693 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
6694 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
6695 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6698 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
6700 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
6701 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
6702 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6703 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
6704 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6705 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6706 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
6710 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
6711 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
6712 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
6713 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
6714 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
6715 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6718 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
6719 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
6720 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
6721 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6725 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
6726 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6728 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6729 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
6730 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
6731 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6733 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
6735 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6736 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6737 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6740 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6743 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
6744 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
6745 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
6748 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
6751 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6752 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
6753 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
6754 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7]
6755 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6758 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6759 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
6760 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
6761 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6766 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6767 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
6768 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6769 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
6770 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
6771 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6774 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
6775 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6776 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6777 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6782 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
6783 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
6784 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
6785 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,2,2,3]
6790 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
6791 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
6792 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5]
6794 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
6795 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6798 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
6800 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
6801 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6806 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6807 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
6808 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6810 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
6811 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6814 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6816 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6817 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6822 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
6823 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
6824 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
6825 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,3]
6830 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
6831 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
6832 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
6834 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
6835 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6838 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
6840 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
6846 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5]
6847 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
6848 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6850 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
6851 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6854 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6856 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6857 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6862 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
6863 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
6864 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5]
6865 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,2,2,3]
6870 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11]
6871 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
6872 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
6874 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7]
6875 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6878 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
6880 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
6881 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6886 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
6887 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
6888 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
6890 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
6893 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6895 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
6900 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
6901 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
6902 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
6903 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
6908 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
6909 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
6910 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
6912 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
6915 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,2,2,3]
6917 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
6922 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
6923 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
6924 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
6926 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
6929 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
6931 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
6932 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6939 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6941 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6943 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6945 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6949 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6951 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6953 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6955 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6957 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6959 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6961 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6963 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6965 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6967 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6969 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6971 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6973 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6975 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6977 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6979 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6981 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6983 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6985 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6987 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6989 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6991 ; AVX2-NEXT: addq $712, %rsp # imm = 0x2C8
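The AVX2 version performs the same interleave on 32-byte registers. On ymm operands, vpunpcklwd and vpunpckldq operate on each 128-bit lane independently, which is why the CHECK masks pair source indices 0..3 with 8..11, and why every unpack chain ends in a vpermq that moves 64-bit quarters across lanes before the vpblendd. A short sketch of the per-lane behaviour, assuming an AVX2-capable C compiler and illustrative values:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m256i a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
        __m256i b = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23,
                                      24, 25, 26, 27, 28, 29, 30, 31);
        /* ymm vpunpcklwd interleaves words 0..3 of each 128-bit lane,
           so the two halves of the result come from the two lanes, not
           from the vector as a whole. */
        __m256i lo = _mm256_unpacklo_epi16(a, b);
        unsigned short out[16];
        _mm256_storeu_si256((__m256i *)out, lo);
        for (int i = 0; i < 16; i++)
            printf("%u ", out[i]);
        putchar('\n');
        /* prints: 0 16 1 17 2 18 3 19 8 24 9 25 10 26 11 27
           -- note the jump from source index 3 to 8: this per-lane split
           is what the vpermq steps in the listing compensate for before
           each vpblendd. */
        return 0;
    }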
6996 ; AVX2-FP: # %bb.0:
6997 ; AVX2-FP-NEXT: subq $712, %rsp # imm = 0x2C8
6998 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6999 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7004 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
7007 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
7008 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
7009 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7012 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
7015 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
7016 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
7017 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
7018 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7]
7019 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7025 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,2,1]
7026 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
7027 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7]
7028 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7035 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
7036 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
7038 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
7039 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
7040 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7]
7041 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7043 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
7045 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
7046 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
7047 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7048 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7049 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
7050 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
7051 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
7052 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
7053 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
7054 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
7055 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
7056 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
7057 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7061 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
7062 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
7063 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
7064 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7071 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
7072 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7074 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
7075 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
7076 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
7077 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7080 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7082 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
7083 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
7084 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7085 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
7086 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7087 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
7088 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
7092 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
7093 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
7094 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
7095 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
7096 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
7097 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7100 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7101 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
7102 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
7103 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7107 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7108 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7110 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7111 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
7112 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
7113 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7115 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
7117 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
7118 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7119 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7122 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7125 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7126 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
7127 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
7130 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
7133 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
7134 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
7135 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
7136 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7]
7137 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7140 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7141 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
7142 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
7143 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7148 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
7149 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7150 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7151 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
7152 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
7153 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7156 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
7157 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
7158 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7159 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7164 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
7165 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
7166 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
7167 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,2,2,3]
7172 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
7173 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
7174 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5]
7176 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
7177 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7180 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
7182 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
7183 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7188 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7189 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
7190 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7192 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
7193 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7196 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
7198 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7199 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7204 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
7205 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
7206 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
7207 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,3]
7212 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
7213 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
7214 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
7216 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
7217 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7220 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
7222 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
7228 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5]
7229 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
7230 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7232 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
7233 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7236 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
7238 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7239 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7244 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
7245 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
7246 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5]
7247 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,2,2,3]
7252 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11]
7253 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
7254 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
7256 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7]
7257 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7260 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
7262 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
7263 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7268 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
7269 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
7270 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
7272 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
7275 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
7277 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
7282 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
7283 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
7284 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
7285 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
7290 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
7291 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
7292 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
7294 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
7297 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,2,2,3]
7299 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
7304 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
7305 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
7306 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
7308 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
7311 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
7313 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
7314 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7321 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7323 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7325 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7327 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7331 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7333 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7335 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7337 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7339 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7341 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7343 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7345 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7347 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7349 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7351 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7353 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7355 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7357 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7359 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7361 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7363 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7365 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7367 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7369 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7371 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7373 ; AVX2-FP-NEXT: addq $712, %rsp # imm = 0x2C8
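The AVX2-FCP variant trades some of the fixed-immediate shuffles for variable permutes: dword index vectors such as [0,0,0,0,0,0,1,1] and [2,2,2,2,0,0,3,3] are materialized with vpmovsxbd/vpmovsxbq from packed byte constants, which takes an 8-byte load instead of a full 32-byte constant-pool load. The consuming shuffles fall outside the matched lines, but on AVX2 an index vector like this typically drives vpermd. A hedged sketch of that pattern, assuming AVX2 and illustrative values:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m256i v   = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        /* The listing builds index vectors like [0,0,0,0,0,0,1,1] via
           vpmovsxbd from a byte constant; setr is the portable spelling. */
        __m256i idx = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 1, 1);
        /* vpermd (_mm256_permutevar8x32_epi32) is a fully general
           cross-lane dword permute: dst[i] = v[idx[i]]. */
        __m256i r = _mm256_permutevar8x32_epi32(v, idx);
        int out[8];
        _mm256_storeu_si256((__m256i *)out, r);
        for (int i = 0; i < 8; i++)
            printf("%d ", out[i]);   /* prints: 10 10 10 10 10 10 11 11 */
        putchar('\n');
        return 0;
    }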
7378 ; AVX2-FCP: # %bb.0:
7379 ; AVX2-FCP-NEXT: subq $776, %rsp # imm = 0x308
7380 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7381 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7384 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
7385 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1]
7389 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
7390 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,1]
7392 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
7395 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
7396 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0]
7400 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
7401 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,1,1,0]
7403 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7]
7404 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7]
7405 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7406 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3]
7409 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
7410 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,3,3,3,3,0,0]
7413 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
7414 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
7415 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7417 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1]
7420 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,0,1]
7422 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
7425 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,1,1,1,1,0,0]
7428 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,1,0]
7430 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7432 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
7433 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7437 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
7439 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,2,3,3,3,3,0,0]
7442 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
7443 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7444 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7445 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3]
7447 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
7449 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
7454 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
7456 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
7457 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,1,1,0]
7459 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
7460 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
7461 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7462 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3]
7465 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
7466 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,3,3,3,3,0,0]
7470 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
7471 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
7472 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7475 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,0,1,1]
7477 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,0,1]
7479 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
7482 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,1,1,1,1,0,0]
7485 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7486 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
7487 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7490 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
7493 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
7495 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7496 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7500 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7502 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
7504 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
7509 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
7511 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
7513 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7]
7514 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
7515 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7516 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3]
7519 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
7520 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0]
7524 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7525 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
7526 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7529 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1]
7531 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,0,1]
7533 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
7536 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,1,1,1,1,0,0]
7538 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,1,1,0]
7540 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7541 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
7542 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7545 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
7548 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
7549 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7550 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7553 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7557 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
7559 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
7562 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
7566 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
7568 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
7569 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7]
7570 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7571 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,2,2,0,0,3,3]
7574 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
7575 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0]
7578 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
7579 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
7580 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7583 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,0,1,1]
7585 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,0,1]
7587 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
7590 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,1,1,1,1,0,0]
7592 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,1,0]
7594 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7595 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
7596 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7599 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
7602 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
7603 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7604 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7609 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
7610 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5]
7612 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
7613 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7]
7615 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
7620 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
7623 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
7626 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7]
7627 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
7628 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7629 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
7633 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5,6],ymm4[7]
7638 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
7639 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
7640 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7643 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5]
7645 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
7647 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7]
7654 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
7655 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
7656 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7657 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
7661 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
7664 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
7665 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
7666 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7671 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
7673 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
7675 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7]
7680 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
7682 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
7685 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7]
7686 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
7687 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7688 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
7692 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7]
7697 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
7698 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
7699 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7702 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5]
7704 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7]
7706 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
7713 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7714 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
7715 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7718 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
7721 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
7722 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7728 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
7730 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
7732 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7]
7737 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
7739 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11]
7742 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7]
7743 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
7744 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7745 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7]
7749 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7]
7754 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
7755 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
7756 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7759 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5]
7761 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7]
7763 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
7769 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
7770 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
7771 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7774 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
7778 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
7779 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
7780 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7785 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
7787 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
7788 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7]
7790 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7]
7795 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
7798 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
7801 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7]
7802 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7]
7803 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7]
7807 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
7812 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
7813 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7]
7816 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5]
7818 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7]
7820 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
7826 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
7827 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7]
7828 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7]
7831 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
7835 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7]
7836 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
7837 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7842 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7844 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7846 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7848 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7852 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7854 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7856 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7858 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7860 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7862 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7864 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7866 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7868 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7870 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7872 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7874 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7876 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7878 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7880 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7882 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7884 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7886 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7888 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7890 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7892 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7894 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7896 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7898 ; AVX2-FCP-NEXT: addq $776, %rsp # imm = 0x308
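; Note (editorial annotation, not autogenerated): the AVX2-FCP sequence above
; ends by streaming its 32-byte ymm spills back with vmovups before tearing
; down the 776-byte frame. The AVX512 checks that follow instead build dword
; permute indices with vpmovsxbd from sign-extended byte constants.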
7903 ; AVX512: # %bb.0:
7904 ; AVX512-NEXT: subq $504, %rsp # imm = 0x1F8
7905 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
7906 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
7908 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7911 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7915 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7918 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
7920 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
7921 ; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888
7924 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7925 ; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222
7929 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
7932 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
7933 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
7935 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
7937 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7942 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
7945 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
7948 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7953 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7960 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
7963 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
7966 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7971 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7974 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
7977 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
7980 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7985 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
7987 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
7995 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8000 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8001 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8004 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8009 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8012 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
8017 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
8021 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
8034 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
8037 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11]
8046 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
8049 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
8061 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
8063 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8073 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8074 ; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
8075 ; AVX512-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
8077 ; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload
8078 ; AVX512-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
8083 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
8085 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
8089 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8093 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8101 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8105 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
8113 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8117 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
8124 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
8125 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
8130 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8132 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8133 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8135 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8137 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8139 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8141 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8143 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8145 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8155 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
8172 ; AVX512-NEXT: addq $504, %rsp # imm = 0x1F8
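; Note: in the AVX512 block above, the write-mask immediates 0x8888 and 0x2222
; select every fourth lane of a 16-element vector, starting at positions 3 and
; 1 respectively (the kmovw transfers into the k-registers are not among the
; matched lines). The AVX512-FCP variant below leans on those masks heavily.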
8177 ; AVX512-FCP: # %bb.0:
8178 ; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908
8179 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8180 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8184 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8187 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8189 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8194 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8197 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
8199 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8202 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
8204 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8207 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
8209 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8212 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
8214 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8217 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8220 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8223 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8226 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8229 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8231 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8234 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
8236 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8239 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
8240 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8243 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
8244 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8247 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8250 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8252 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8254 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8257 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
8261 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
8265 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
8271 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[8],ymm8[8],ymm5[9],ymm8[9],ymm5[10],ymm8[10],ymm5[11],ymm8[11]
8274 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8277 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8279 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8282 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
8288 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8289 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8291 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8293 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8297 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8298 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8300 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8302 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8304 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8306 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8308 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8311 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
8315 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
8319 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
8323 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
8333 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
8339 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8345 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8346 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8348 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8353 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8354 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8356 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8359 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
8363 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
8367 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11]
8371 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
8379 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8380 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
8382 ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload
8383 ; AVX512-FCP-NEXT: # xmm8 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
8389 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8390 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8391 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8397 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8398 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8399 ; AVX512-FCP-NEXT: movw $-30584, %ax # imm = 0x8888
8401 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15]
8402 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
8403 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15]
8404 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload
8406 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
8407 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload
8414 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8419 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8422 ; AVX512-FCP-NEXT: movw $8738, %ax # imm = 0x2222
8425 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
8427 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload
8428 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
8429 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload
8430 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
8431 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 {%k1} # 64-byte Folded Reload
8433 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 {%k1} # 64-byte Folded Reload
8442 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11]
8443 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
8444 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload
8445 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11]
8446 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
8447 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload
8448 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
8449 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 {%k2} # 64-byte Folded Reload
8450 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
8451 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 {%k2} # 64-byte Folded Reload
8452 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 64-byte Folded Reload
8453 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 {%k2} # 64-byte Folded Reload
8461 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0]
8463 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8465 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0]
8467 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8469 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0]
8471 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8474 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8476 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8480 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8482 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8486 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8488 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8492 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8494 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8498 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8500 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8504 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
8506 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8522 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8527 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8531 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8548 ; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908
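; Note: relative to plain AVX512, the AVX512-FCP run above trades a far larger
; spill frame (2312 vs. 504 bytes) for doing most of the interleave through
; masked vpermd ops whose 64-byte operands are folded reloads from the stack.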
8553 ; AVX512DQ: # %bb.0:
8554 ; AVX512DQ-NEXT: subq $504, %rsp # imm = 0x1F8
8555 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
8556 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
8558 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8561 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8565 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8568 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
8570 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
8571 ; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888
8574 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8575 ; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222
8579 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
8582 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
8583 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
8585 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
8587 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8592 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
8595 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
8598 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8603 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8610 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
8613 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
8616 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8621 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8624 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
8627 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
8630 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8635 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
8637 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
8645 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8650 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8651 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8654 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8659 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8662 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
8667 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
8671 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
8684 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
8687 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11]
8696 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
8699 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
8711 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
8713 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8723 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8724 ; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
8725 ; AVX512DQ-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
8727 ; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload
8728 ; AVX512DQ-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
8733 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
8735 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
8739 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8743 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8751 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8755 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
8763 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8767 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
8774 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
8775 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
8780 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8782 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8783 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8785 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8787 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8789 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8791 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8793 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8795 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8805 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
8822 ; AVX512DQ-NEXT: addq $504, %rsp # imm = 0x1F8
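; Note: the AVX512DQ matches above are identical to the AVX512 ones apart from
; the check prefix; the DQ feature set changes nothing in this lowering, so
; the separate RUN line appears to produce the same code here.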
8827 ; AVX512DQ-FCP: # %bb.0:
8828 ; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908
8829 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8830 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8834 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8837 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8839 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8844 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8847 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
8849 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8852 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
8854 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8857 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
8859 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8862 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
8864 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8867 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8870 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8873 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8876 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8879 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8881 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8884 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
8886 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8889 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
8890 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8893 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
8894 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8897 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8900 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8902 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8904 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8907 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
8911 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
8915 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
8921 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[8],ymm8[8],ymm5[9],ymm8[9],ymm5[10],ymm8[10],ymm5[11],ymm8[11]
8924 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8927 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8929 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8932 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
8938 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8939 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8941 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8943 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8947 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8948 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8950 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8952 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8954 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8956 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8958 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8961 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
8965 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
8969 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
8973 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
8983 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
8989 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8995 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8996 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8998 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9003 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
9004 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9006 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9009 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
9013 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
9017 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11]
9021 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
9029 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9030 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
9032 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload
9033 ; AVX512DQ-FCP-NEXT: # xmm8 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9039 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9040 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
9041 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9047 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9048 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9049 ; AVX512DQ-FCP-NEXT: movw $-30584, %ax # imm = 0x8888
9051 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15]
9052 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
9053 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15]
9054 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload
9056 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
9057 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload
9064 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9069 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9072 ; AVX512DQ-FCP-NEXT: movw $8738, %ax # imm = 0x2222
9075 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
9077 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload
9078 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
9079 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload
9080 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
9081 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 {%k1} # 64-byte Folded Reload
9083 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 {%k1} # 64-byte Folded Reload
9092 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11]
9093 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
9094 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload
9095 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11]
9096 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
9097 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload
9098 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
9099 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 {%k2} # 64-byte Folded Reload
9100 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
9101 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 {%k2} # 64-byte Folded Reload
9102 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 64-byte Folded Reload
9103 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 {%k2} # 64-byte Folded Reload
9111 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0]
9113 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9115 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0]
9117 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9119 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0]
9121 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9124 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9126 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9130 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9132 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9136 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9138 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9142 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9144 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9148 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9150 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9154 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
9156 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9172 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9177 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9181 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9198 ; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908
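; Note: as with the non-FCP pair, AVX512DQ-FCP above mirrors AVX512-FCP. The
; AVX512BW checks below switch to word granularity: vpmovsxbw materializes
; 32-element word indices in which values of 32 or more (36, 44, 60, ...) pick
; lanes from a second source, the encoding used by two-source word permutes
; such as vpermt2w (the permute instructions themselves did not match the
; search pattern, so they are not shown in this listing).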
9203 ; AVX512BW: # %bb.0:
9204 ; AVX512BW-NEXT: subq $2056, %rsp # imm = 0x808
9205 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
9206 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
9213 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
9217 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9218 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
9221 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9222 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
9225 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9226 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
9229 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9230 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
9233 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9234 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
9237 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9238 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
9241 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9242 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
9245 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9246 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
9249 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9250 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
9253 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9254 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
9257 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9258 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
9259 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
9262 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9263 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
9265 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9268 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9270 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9272 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9274 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9276 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9278 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9280 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9282 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9284 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9285 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
9288 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
9290 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9294 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9296 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9298 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9300 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9302 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9304 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9309 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
9312 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
9315 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
9318 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
9321 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
9324 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
9327 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
9330 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
9335 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9337 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9341 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9343 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9349 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
9352 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
9355 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
9358 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
9361 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
9364 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
9367 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
9370 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
9382 ; AVX512BW-NEXT: movw $-30584, %ax # imm = 0x8888
9384 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9385 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9387 ; AVX512BW-NEXT: movw $8738, %ax # imm = 0x2222
9393 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9394 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9398 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9399 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9403 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9404 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9408 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9409 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9413 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9414 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9418 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9422 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9423 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9427 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9428 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9430 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9433 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9434 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9436 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9439 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9440 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9445 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9446 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9448 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9451 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9452 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9454 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9457 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9458 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9462 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9466 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9470 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
9487 ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808
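; Note: the AVX512BW-FCP run that follows reserves the same 2056-byte frame
; and rebuilds the same vpmovsxbw index-vector set as AVX512BW above; in the
; matched lines only the check prefix differs.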
9492 ; AVX512BW-FCP: # %bb.0:
9493 ; AVX512BW-FCP-NEXT: subq $2056, %rsp # imm = 0x808
9494 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9495 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
9502 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
9506 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9507 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
9510 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9511 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
9514 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9515 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
9518 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9519 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
9522 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9523 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
9526 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9527 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
9530 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9531 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
9534 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9535 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
9538 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9539 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
9542 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9543 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
9546 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9547 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
9548 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
9551 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9552 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
9554 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9557 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9559 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9561 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9563 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9565 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9567 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9569 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9571 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9573 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9574 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
9577 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
9579 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9583 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9585 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9587 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9589 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9591 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9593 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9598 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
9601 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
9604 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
9607 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
9610 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
9613 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
9616 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
9619 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
9624 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9626 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9630 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9632 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9638 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
9641 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
9644 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
9647 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
9650 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
9653 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
9656 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
9659 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
9671 ; AVX512BW-FCP-NEXT: movw $-30584, %ax # imm = 0x8888
9673 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9674 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9676 ; AVX512BW-FCP-NEXT: movw $8738, %ax # imm = 0x2222
9682 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9683 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9687 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9688 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9692 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9693 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9697 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9698 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9702 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9703 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9707 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9711 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9712 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9716 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9717 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9719 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9722 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9723 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9725 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9728 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9729 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9734 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9735 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9737 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9740 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9741 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9743 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9746 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9747 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
9751 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9755 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9759 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9776 ; AVX512BW-FCP-NEXT: addq $2056, %rsp # imm = 0x808
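; Editorial note: each AVX512 block above follows the same shape. The vpmovsxbw
; lines materialize <32 x i16> permute-index vectors from sign-extended byte
; constants (index values 32-63 would select from the second source in a
; two-source word permute such as vpermt2w -- an assumption here, since only
; lines matching the search appear in this listing), and the results are spilled
; to and reloaded from the 2056-byte stack frame. The 0x8888 and 0x2222
; immediates are presumably moved into k-registers to drive masked blends of the
; permuted vectors.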
9781 ; AVX512DQ-BW: # %bb.0:
9782 ; AVX512DQ-BW-NEXT: subq $2056, %rsp # imm = 0x808
9783 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
9784 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
9791 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
9795 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9796 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
9799 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9800 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
9803 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9804 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
9807 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9808 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
9811 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9812 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
9815 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9816 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
9819 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9820 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
9823 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9824 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
9827 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9828 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
9831 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9832 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
9835 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9836 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
9837 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
9840 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9841 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
9843 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9846 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9848 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9850 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9852 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9854 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9856 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9858 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9860 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9862 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9863 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
9866 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
9868 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9872 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9874 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9876 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9878 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9880 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9882 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9887 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
9890 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
9893 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
9896 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
9899 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
9902 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
9905 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
9908 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
9913 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9915 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9919 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9921 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9927 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
9930 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
9933 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
9936 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
9939 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
9942 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
9945 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
9948 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
9960 ; AVX512DQ-BW-NEXT: movw $-30584, %ax # imm = 0x8888
9962 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9963 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9965 ; AVX512DQ-BW-NEXT: movw $8738, %ax # imm = 0x2222
9971 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9972 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9976 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9977 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9981 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9982 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9986 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9987 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9991 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9992 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9996 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10000 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10001 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10005 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10006 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10008 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10011 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10012 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10014 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10017 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10018 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10023 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10024 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10026 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10029 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10030 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10032 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10035 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10036 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10040 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10044 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10048 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
10065 ; AVX512DQ-BW-NEXT: addq $2056, %rsp # imm = 0x808
10070 ; AVX512DQ-BW-FCP: # %bb.0:
10071 ; AVX512DQ-BW-FCP-NEXT: subq $2056, %rsp # imm = 0x808
10072 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10073 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
10080 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39]
10084 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10085 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0]
10088 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10089 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35]
10092 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10093 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0]
10096 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10097 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47]
10100 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10101 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0]
10104 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10105 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43]
10108 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10109 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0]
10112 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10113 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55]
10116 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10117 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0]
10120 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10121 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51]
10124 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10125 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0]
10126 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63]
10129 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10130 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59]
10132 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10135 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10137 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10139 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10141 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10143 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10145 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10147 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10149 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10151 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10152 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0]
10155 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0]
10157 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10161 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10163 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10165 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10167 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10169 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10171 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10176 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0]
10179 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0]
10182 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0]
10185 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0]
10188 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0]
10191 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0]
10194 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0]
10197 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0]
10202 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10204 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10208 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10210 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10216 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0]
10219 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0]
10222 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0]
10225 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0]
10228 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0]
10231 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0]
10234 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0]
10237 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0]
10249 ; AVX512DQ-BW-FCP-NEXT: movw $-30584, %ax # imm = 0x8888
10251 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10252 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10254 ; AVX512DQ-BW-FCP-NEXT: movw $8738, %ax # imm = 0x2222
10260 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10261 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10265 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10266 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10270 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10271 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10275 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10276 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10280 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10281 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10285 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10289 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10290 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10294 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10295 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10297 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10300 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10301 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10303 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10306 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10307 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10312 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10313 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10315 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10318 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10319 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10321 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10324 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10325 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10329 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10333 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10337 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10354 ; AVX512DQ-BW-FCP-NEXT: addq $2056, %rsp # imm = 0x808
10365 %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10366 %2 = shufflevector <64 x i16> %in.vec2, <64 x i16> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10367 %3 = shufflevector <64 x i16> %in.vec4, <64 x i16> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10368 %4 = shufflevector <64 x i16> %in.vec6, <64 x i16> %in.vec7, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10369 %5 = shufflevector <128 x i16> %1, <128 x i16> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
10370 %6 = shufflevector <128 x i16> %3, <128 x i16> %4, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
10371 %7 = shufflevector <256 x i16> %5, <256 x i16> %6, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447, i32 448, i32 449, i32 450, i32 451, i32 452, i32 453, i32 454, i32 455, i32 456, i32 457, i32 458, i32 459, i32 460, i32 461, i32 462, i32 463, i32 464, i32 465, i32 466, i32 467, i32 468, i32 469, i32 470, i32 471, i32 472, i32 473, i32 474, i32 475, i32 476, i32 477, i32 478, i32 479, i32 480, i32 481, i32 482, i32 483, i32 484, i32 485, i32 486, i32 487, i32 488, i32 489, i32 490, i32 491, i32 492, i32 493, i32 494, i32 495, i32 496, i32 497, i32 498, i32 499, i32 500, i32 501, i32 502, i32 503, i32 504, i32 505, i32 506, i32 507, i32 508, i32 509, i32 510, i32 511>
10372 %interleaved.vec = shufflevector <512 x i16> %7, <512 x i16> poison, <512 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 448, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 449, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 450, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 451, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 452, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 453, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 454, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 391, i32 455, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 456, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 457, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 458, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 459, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 460, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 461, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 462, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 463, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 464, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 465, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 466, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 467, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 468, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 469, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 470, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 471, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 472, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 473, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 474, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 475, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 412, i32 476, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 477, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 478, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 479, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 480, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 481, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 482, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 483, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 484, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 485, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 486, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 487, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 488, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 489, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 490, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 491, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 492, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 493, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 494, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 495, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 496, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 497, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 498, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 499, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 500, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 501, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 502, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 503, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 440, i32 504, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 505, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 442, i32 506, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 507, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 508, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 509, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 510, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447, i32 511>
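; Editorial note: the %interleaved.vec shuffle above is an 8-way interleave --
; output element 8*g+k takes source element 64*k+g -- i.e. a stride-8
; interleaved store of the eight <64 x i16> inputs concatenated into %7. A
; minimal LLVM IR sketch of the same mask shape, at stride 2 on two
; hypothetical <4 x i16> inputs %a and %b (not part of this test):
;
;   %ab = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   %ileave = shufflevector <8 x i16> %ab, <8 x i16> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>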